我正在执行复杂双打(ZGEMM)的矩阵矩阵乘法,并且使用SSE的思想会有所帮助,但事实上它并没有减慢代码速度。我想知道它是否可能因为它的内存限制了吗?
下面是伪代码:
为了将两个复数双精度相乘,我使用以下内容,正如intel提出的那样(假设实数和复数是连续存储的):
如果M =(a + ib)且IN =(c + id):
M1 = _mm_loaddup_pd(&M[0]);//M1->|a|a|
I1 = _mm_load_pd(&IN);//I1->|d|c|
T1 = _mm_mul_pd(M1,I1);//T1->|a*d|a*c|
M1 = _mm_loaddup_pd(&M[1]);//M1->|b|b|
I1 = _mm_shuffle_pd(I1,I1,1);//I1->|c|d|
I1 = _mm_mul_pd(M1,I1);//I1->|b*c|b*d|
T1 = _mm_addsub_pd(T1,I1); //T1-> |ad+bc|ac-bd|
因此T1存储复数乘法的结果。
这是矩阵乘法(C[i][j] += A[i][k]*B[k][j]
):
/*Assumes real and imaginary elements are stored contiguously*/
/*Used loop order: ikj for better cache locality and used 2*2 block matrix mult*/
for(i=0;i<N;i+=2){
for(k=0;k<N;k+=2){
/*Perform the _mm_loaddup() part here for A[i][k],A[i][k+1],A[i+1][k],A[i+1][k+1] since im blocking for 2*2 matrix mult i.e will load duplicates of 8 double values into 8 SIMD registers here*/
A00r = _mm_loaddup_pd(&A[(i*N+k)*2+0]);
A00i = _mm_loaddup_pd(&A[(i*N+k)*2+1]);
A01r = _mm_loaddup_pd(&A[(i*N+k)*2+2]);
A01i = _mm_loaddup_pd(&A[(i*N+k)*2+3]);
A10r = _mm_loaddup_pd(&A[((i+1)*N+k)*2+0]);
A10i = _mm_loaddup_pd(&A[((i+1)*N+k)*2+1);
A11r = _mm_loaddup_pd(&A[((i+1)*N+k)*2+2);
A11i = _mm_loaddup_pd(&A[((i+1)*N+k)*2+2);
for(j=0;j<N;j+=2){
double out[8] = {0,0,0,0,0,0,0,0};
op00=op01=op10=op11=_mm_setzero_pd();
B00 = _mm_loadu_pd(&B[(k*N+j)*2]);
B00_r = _mm_shuffle_pd(B00,B00,1);
B01 = _mm_loadu_pd(&B[(k*N+j+1)*2]);
B01_r = _mm_shuffle_pd(B01,B01,1);
/*Perform A[i][k]*B[k][j], A[i][k]*B[k][j+1], A[i+1][k]*B[k][j], A[i+1][k]*B[k][j+1] and assign it to op00,op01,op10,op11 respectively -> takes 8 _mm_mul_pd() and 4 _mm_addsub_pd()*/
T1 = _mm_mul_pd(A00r,B00);
T2 = _mm_mul_pd(A00i,B00_r);
op00 = _mm_addsub_pd(T1,T2);
T1 = _mm_mul_pd(A00r,B01);
T2 = _mm_mul_pd(A00i,B01_r);
op01 = _mm_addsub_pd(T1,T2);
T1 = _mm_mul_pd(A10r,B00);
T2 = _mm_mul_pd(A10i,B00_r);
op10 = _mm_addsub_pd(T1,T2);
T1 = _mm_mul_pd(A10r,B01);
T2 = _mm_mul_pd(A10i,B01_r);
op11 = _mm_addsub_pd(T1,T2);
B00 = _mm_loadu_pd(&B[((k+1)*N+j)*2]);
B00_r = _mm_shuffle_pd(B00,B00,1);
B01 = _mm_loadu_pd(&B[((k+1)*N+j+1)*2]);
B00_r = _mm_shuffle_pd(B01,B01,1);
/*Perform A[i][k+1]*B[k+1][j],A[i][k+1]*B[k+1][j+1],A[i+1][k+1]*B[k+1][j],A[i+1][k+1]*B[k+1][j+1] and add it to op00,op01,op10,op11 respectively-> takes 8 _mm_mul_pd(), 4 _mm_add_pd(), 4 _mm_addsub_pd()*/
T1 = _mm_mul_pd(A01r,B10);
T2 = _mm_mul_pd(A01i,B10_r);
op00 = _mm_add_pd(op00,_mm_addsub_pd(T1,T2));
T1 = _mm_mul_pd(A01r,B11);
T2 = _mm_mul_pd(A01i,B11_r);
op01 = _mm_add_pd(op01,_mm_addsub_pd(T1,T2));
T1 = _mm_mul_pd(A11r,B10);
T2 = _mm_mul_pd(A11i,B10_r);
op10 = _mm_add_pd(op10,_mm_addsub_pd(T1,T2));
T1 = _mm_mul_pd(A11r,B11);
T2 = _mm_mul_pd(A11i,B11_r);
op11 = _mm_add_pd(op11,_mm_addsub_pd(T1,T2));
/*Store op00,op01,op10,op11 into out[0],out[2],out[4] and out[6] -> 4 stores*/
_mm_storeu_pd(&out[0],op00);
_mm_storeu_pd(&out[2],op01);
_mm_storeu_pd(&out[4],op10);
_mm_storeu_pd(&out[6],op11);
/*Perform the following 8 operations*/
C[(i*N+j)*2+0] += out[0];
C[(i*N+j)*2+1] += out[1];
.
.
.
C[((i+1)*N+j)*2+3] += out[7];
}
}
}
L1缓存是32KB,所以我也使用缓存阻塞(瓦片大小为16 * 16,使工作集大小为12KB(3 * 2 ^ 4 * 2 ^ 4 * 2 ^ 3 * 2))但它没有多大帮助。我只获得理论峰值性能的50%左右。关于如何改进这个的任何指示?