我有这个用于矩阵乘法的C程序,我已经读过并试过循环展开......但除此之外我不知道如何加速C程序。有人能指出我还能做些什么吗?
const char* dgemm_desc = "Simple blocked dgemm.";
#if !defined(BLOCK_SIZE)
#define BLOCK_SIZE 41
#endif
#define min(a,b) (((a)<(b))?(a):(b))
/* This routine performs a dgemm operation
* C := C + A * B
* where A, B, and C are lda-by-lda matrices stored in column-major format.
* On exit, A and B maintain their input values. */
void square_dgemm (int lda, double* A, double* B, double* C)
{
/* For each block-row of A */
for (int i = 0; i < lda; i += BLOCK_SIZE)
/* For each block-column of B */
for (int j = 0; j < lda; j += BLOCK_SIZE)
/* Accumulate block dgemms into block of C */
for (int k = 0; k < lda; k += BLOCK_SIZE)
{
/* Correct block dimensions if block "goes off edge of" the matrix */
int M = min (BLOCK_SIZE, lda-i);
int N = min (BLOCK_SIZE, lda-j);
int K = min (BLOCK_SIZE, lda-k);
/* Perform individual block dgemm */
if( (M % BLOCK_SIZE <= 2) && (N % BLOCK_SIZE <= 2) && (K % BLOCK_SIZE <= 1 ))
{
do_block_fast(lda, M, N, K, A + i + k*lda, B+ k + j *lda, C + i + j*lda);
}else
{
do_block(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}
}
}
主程序,调用子程序do_block_fast()和do_block()。
void do_block_fast (int lda, int M, int N, int K, double* A, double* B, double* C)
{
static double a[BLOCK_SIZE*BLOCK_SIZE] __attribute__ ((aligned (16)));
// make a local aligned copy of A's block
for( int j = 0; j < K; j++ )
for( int i = 0; i < M; i++ )
a[i+j*BLOCK_SIZE] = A[i+j*lda];
/* For each row i of A */
for (int i = 0; i < M; ++i)
/* For each column j of B */
for (int j = 0; j < N; ++j)
{
/* Compute C(i,j) */
double cij = C[i+j*lda];
//for (int k = 0; k < K; ++k){
// cij += a[i+k*BLOCK_SIZE] * B[k+j*lda];
// }
for (int k = 0; k < K; k+= 41)
{
cij += a[i+k*BLOCK_SIZE] * B[k+j*lda];
cij += a[i+(k+1)*BLOCK_SIZE] * B[(k+1)+j*lda];
cij += a[i+(k+2)*BLOCK_SIZE] * B[(k+2)+j*lda];
cij += a[i+(k+3)*BLOCK_SIZE] * B[(k+3)+j*lda];
cij += a[i+(k+4)*BLOCK_SIZE] * B[(k+4)+j*lda];
cij += a[i+(k+5)*BLOCK_SIZE] * B[(k+5)+j*lda];
cij += a[i+(k+6)*BLOCK_SIZE] * B[(k+6)+j*lda];
cij += a[i+(k+7)*BLOCK_SIZE] * B[(k+7)+j*lda];
cij += a[i+(k+8)*BLOCK_SIZE] * B[(k+8)+j*lda];
cij += a[i+(k+9)*BLOCK_SIZE] * B[(k+9)+j*lda];
cij += a[i+(k+10)*BLOCK_SIZE] * B[(k+10)+j*lda];
cij += a[i+(k+11)*BLOCK_SIZE] * B[(k+11)+j*lda];
cij += a[i+(k+12)*BLOCK_SIZE] * B[(k+12)+j*lda];
cij += a[i+(k+13)*BLOCK_SIZE] * B[(k+13)+j*lda];
cij += a[i+(k+14)*BLOCK_SIZE] * B[(k+14)+j*lda];
cij += a[i+(k+15)*BLOCK_SIZE] * B[(k+15)+j*lda];
cij += a[i+(k+16)*BLOCK_SIZE] * B[(k+16)+j*lda];
cij += a[i+(k+17)*BLOCK_SIZE] * B[(k+17)+j*lda];
cij += a[i+(k+18)*BLOCK_SIZE] * B[(k+18)+j*lda];
cij += a[i+(k+19)*BLOCK_SIZE] * B[(k+19)+j*lda];
cij += a[i+(k+20)*BLOCK_SIZE] * B[(k+20)+j*lda];
cij += a[i+(k+21)*BLOCK_SIZE] * B[(k+21)+j*lda];
cij += a[i+(k+22)*BLOCK_SIZE] * B[(k+22)+j*lda];
cij += a[i+(k+23)*BLOCK_SIZE] * B[(k+23)+j*lda];
cij += a[i+(k+24)*BLOCK_SIZE] * B[(k+24)+j*lda];
cij += a[i+(k+25)*BLOCK_SIZE] * B[(k+25)+j*lda];
cij += a[i+(k+26)*BLOCK_SIZE] * B[(k+26)+j*lda];
cij += a[i+(k+27)*BLOCK_SIZE] * B[(k+27)+j*lda];
cij += a[i+(k+28)*BLOCK_SIZE] * B[(k+28)+j*lda];
cij += a[i+(k+29)*BLOCK_SIZE] * B[(k+29)+j*lda];
cij += a[i+(k+30)*BLOCK_SIZE] * B[(k+30)+j*lda];
cij += a[i+(k+31)*BLOCK_SIZE] * B[(k+31)+j*lda];
cij += a[i+(k+32)*BLOCK_SIZE] * B[(k+32)+j*lda];
cij += a[i+(k+33)*BLOCK_SIZE] * B[(k+33)+j*lda];
...
...
}
C[i+j*lda] = cij;
}
}
子程序do_block_fast()...基本上因为我知道K = 41的值,所以我刚刚将最里面的for循环展开了41。
static void do_block (int lda, int M, int N, int K, double* A, double* B, double* C)
{
/* For each row i of A */
for (int i = 0; i < M; ++i)
/* For each column j of B */
for (int j = 0; j < N; ++j)
{
/* Compute C(i,j) */
double cij = C[i+j*lda];
if(K % 8 == 0)
for(int k = 0; k < K; k += 8){
cij += A[i+k*lda] * B[k+j*lda];
cij += A[i+(k+1)*lda] * B[(k+1)+j*lda];
cij += A[i+(k+2)*lda] * B[(k+2)+j*lda];
cij += A[i+(k+3)*lda] * B[(k+3)+j*lda];
cij += A[i+(k+4)*lda] * B[(k+4)+j*lda];
cij += A[i+(k+5)*lda] * B[(k+5)+j*lda];
cij += A[i+(k+6)*lda] * B[(k+6)+j*lda];
cij += A[i+(k+7)*lda] * B[(k+7)+j*lda];
}
else if(K % 7 == 0)
for(int k = 0; k < K; k += 7){
cij += A[i+k*lda] * B[k+j*lda];
cij += A[i+(k+1)*lda] * B[(k+1)+j*lda];
cij += A[i+(k+2)*lda] * B[(k+2)+j*lda];
cij += A[i+(k+3)*lda] * B[(k+3)+j*lda];
cij += A[i+(k+4)*lda] * B[(k+4)+j*lda];
cij += A[i+(k+5)*lda] * B[(k+5)+j*lda];
cij += A[i+(k+6)*lda] * B[(k+6)+j*lda];
}
else if(K % 6 == 0)
for(int k = 0; k < K; k += 6){
cij += A[i+k*lda] * B[k+j*lda];
cij += A[i+(k+1)*lda] * B[(k+1)+j*lda];
cij += A[i+(k+2)*lda] * B[(k+2)+j*lda];
cij += A[i+(k+3)*lda] * B[(k+3)+j*lda];
cij += A[i+(k+4)*lda] * B[(k+4)+j*lda];
cij += A[i+(k+5)*lda] * B[(k+5)+j*lda];
}
...
else
for (int k = 0; k < K; ++k)
cij += A[i+k*lda] * B[k+j*lda];
C[i+j*lda] = cij;
}
}
子程序do_block()。由于我不知道K的值,我只使用了分支语句,以展开最里面的for循环。
最后这是程序的结果
Size: 31 Mflop/s: 2361.82 Percentage: 5.51
Size: 32 Mflop/s: 2941.3 Percentage: 6.86
Size: 96 Mflop/s: 2861.72 Percentage: 6.67
Size: 97 Mflop/s: 2657.44 Percentage: 6.19
Size: 127 Mflop/s: 3028.53 Percentage: 7.06
Size: 128 Mflop/s: 2925.26 Percentage: 6.82
Size: 129 Mflop/s: 2823.6 Percentage: 6.58
Size: 191 Mflop/s: 2893.04 Percentage: 6.74
Size: 192 Mflop/s: 2925.12 Percentage: 6.82
Size: 229 Mflop/s: 2834.69 Percentage: 6.61
Size: 255 Mflop/s: 2963.94 Percentage: 6.91
Size: 256 Mflop/s: 2712.55 Percentage: 6.32
Size: 257 Mflop/s: 2905.3 Percentage: 6.77
Size: 319 Mflop/s: 2878.31 Percentage: 6.71
Size: 320 Mflop/s: 2855.04 Percentage: 6.66
Size: 321 Mflop/s: 2835.52 Percentage: 6.61
Size: 417 Mflop/s: 2971.67 Percentage: 6.93
Size: 479 Mflop/s: 2876.16 Percentage: 6.70
Size: 480 Mflop/s: 2813.88 Percentage: 6.56
Size: 511 Mflop/s: 2717.75 Percentage: 6.34
Size: 512 Mflop/s: 1740.14 Percentage: 4.06
Size: 639 Mflop/s: 2614.61 Percentage: 6.09
Size: 640 Mflop/s: 2453.03 Percentage: 5.72
Size: 767 Mflop/s: 2494.76 Percentage: 5.82
Size: 768 Mflop/s: 2389.77 Percentage: 5.57
Size: 769 Mflop/s: 2779.2 Percentage: 6.48
Average percentage of Peak = 6.38822