Re: NXC: resolved - Matrix algebra library
Posted: 13 Jul 2011, 09:07
Linusa,
The issue is that accessing elements using [j] isn't very efficient so accessing by [j] will be just as bad. (At least, not anywhere near the performance I'm looking for...) And there is performance differences, if you want to access [j+1] afterwards this will be more efficient in NXC than doing [i+1][j]. (In NXC multidimensional arrays should be treated as arrays of array pointers and not a linear clump of memory.)
Well, I just optimized the function instead and made it 3 times as fast.
You can see every step in the optimization process, so if interested you can try each out and see how much/how little they matters.
One functional change in the last modification, it automatically resets C to the right array size. If you also want to reduce memory requirements while the function is not in use, add the following at the very end:
The issue is that accessing elements using [j] isn't very efficient so accessing by [j] will be just as bad. (At least, not anywhere near the performance I'm looking for...) And there is performance differences, if you want to access [j+1] afterwards this will be more efficient in NXC than doing [i+1][j]. (In NXC multidimensional arrays should be treated as arrays of array pointers and not a linear clump of memory.)
Well, I just optimized the function instead and made it 3 times as fast.
Code: Select all
/* Multiplies Matrices A and B and stores the result in C.
Size of A is [N][M] and size of B is [M][K]
*/
/*/ Original
void MatrixMatrixMult(float A[][], float B[][], float &C[][], int N, int M, int K){
int i, j, s;
for (i=0; i<N; ++i) {
for (j=0; j<K; ++j) {
C[i][j]=0;
for (s=0; s<M; ++s) {
C[i][j]=C[i][j] + A[i][s]*B[s][j];
}
}
}
}
//*/
/*/ Just optimize array-lookups
void MatrixMatrixMult(float A[][], float B[][], float &C[][], int N, int M, int K){
for( unsigned int i=0; i<N; ++i ){
//Speed up arrays
float A_i[]; A_i = A[i]; //Compiler bug
float C_i[]; C_i = C[i];
for( unsigned int j=0; j<K; ++j ){
float sum = 0;
for( unsigned int s=0; s<M; ++s ){
sum += A_i[s]*B[s][j];
}
C_i[j] = sum;
}
C[i] = C_i;
}
}
//*/
/*/ Optimize loops
void MatrixMatrixMult(float A[][], float B[][], float &C[][], int N, int M, int K){
int i = N; lbl_Mult_start_i: i--; //Loop start
//Speed up arrays
float A_i[]; A_i = A[i]; //Compiler bug
float C_i[]; C_i = C[i];
int j = K;
lbl_Mult_start_j: j--; //Second loop start
float sum = 0;
int s = M; lbl_Mult_start_s: s--; //Third loop start
sum += A_i[s]*B[s][j];
asm{ brtst GT, lbl_Mult_start_s, s }
C_i[j] = sum;
asm{ brtst GT, lbl_Mult_start_j, j }
C[i] = C_i;
asm{ brtst GT, lbl_Mult_start_i, i }
}
//*/
/*/ Optimize index and replace
void MatrixMatrixMult(float A[][], float B[][], float &C[][], int N, int M, int K){
int i = N; lbl_Mult_start_i: i--; //Loop start
//Speed up arrays
float A_i[];
float C_i[];
asm{ index A_i, A, i }
asm{ index C_i, C, i }
int j = K; lbl_Mult_start_j: j--; //Second loop start
float sum = 0;
int s = M; lbl_Mult_start_s: s--; //Third loop start
float arr_temp, sum_temp;
asm{ index sum_temp, A_i, s }
float arr_temp2[];
asm{ index arr_temp2, B, s }
asm{ index arr_temp, arr_temp2, j }
sum += sum_temp * arr_temp;
asm{ brtst GT, lbl_Mult_start_s, s }
asm{ replace C_i, C_i, j, sum }
asm{ brtst GT, lbl_Mult_start_j, j }
asm{ replace C, C, i, C_i }
asm{ brtst GT, lbl_Mult_start_i, i }
}
//*/
//*/ Minor optimizations and improvements
void MatrixMatrixMult(float A[][], float B[][], float &C[][], int N, int M, int K){
float arr_temp[];
//Set size of C
ArrayInit( arr_temp, 0, K );
ArrayInit( C, arr_temp, N );
lbl_Mult_start_i: N--; //Loop start
//Speed up arrays
float A_i[];
float C_i[];
asm{ index A_i, A, N }
asm{ index C_i, C, N }
int j = K; lbl_Mult_start_j: j--; //Second loop start
float sum = 0;
int s = M; lbl_Mult_start_s: s--; //Third loop start
//sum += A_i[s]*B[s][j];
float temp, sum_temp;
asm{ index sum_temp, A_i, s }
asm{ index arr_temp, B, s }
asm{ index temp, arr_temp, j }
sum_temp *= temp;
sum += sum_temp;
asm{ brtst GT, lbl_Mult_start_s, s }
asm{ replace C_i, C_i, j, sum }
asm{ brtst GT, lbl_Mult_start_j, j }
asm{ replace C, C, N, C_i }
asm{ brtst GT, lbl_Mult_start_i, N }
}
//*/
task main() {
float A[10][7], B[7][12], C[10][12];
//Initialize variables
A[0][0]=1; A[0][1]=2; A[0][2]=3;
A[1][0]=3; A[1][1]=4; A[1][2]=4;
A[2][0]=4; A[2][1]=7; A[2][2]=1;
B[0][0]=3; B[0][1]=1; B[0][2]=4;
B[1][0]=7; B[1][1]=6; B[1][2]=8;
B[2][0]=4; B[2][1]=5; B[2][2]=9;
TextOut(0,40,"Mul");
unsigned long t0 = CurrentTick();
repeat( 100 )
MatrixMatrixMult(A,B,C, 10,7,12);
NumOut(0,58, CurrentTick() - t0 ); //Show time taken
//Show results
NumOut(0,32,C[0][0]); NumOut(30,32,C[0][1]); NumOut(60,32,C[0][2]);
NumOut(0,24,C[1][0]); NumOut(30,24,C[1][1]); NumOut(60,24,C[1][2]);
NumOut(0,16,C[2][0]); NumOut(30,16,C[2][1]); NumOut(60,16,C[2][2]);
while(1);
}
One functional change in the last modification, it automatically resets C to the right array size. If you also want to reduce memory requirements while the function is not in use, add the following at the very end:
Code: Select all
ArrayInit( arr_temp, 0, 0 );
ArrayInit( A_i, 0, 0 );
ArrayInit( C_i, 0, 0 );