I am trying to speed up matrix multiplication on a multicore architecture. For this purpose, I use threads and SIMD at the same time, but my results are not good. I measure the speedup against a sequential matrix multiplication:
void sequentialMatMul(void* params)
{
cout << "SequentialMatMul started.";
int i, j, k;
for (i = 0; i < N; i++)
{
for (k = 0; k < N; k++)
{
for (j = 0; j < N; j++)
{
X[i][j] += A[i][k] * B[k][j];
}
}
}
cout << "\nSequentialMatMul finished.";
}
I tried to add threads and SIMD to the matrix multiplication as follows:
void threadedSIMDMatMul(void* params)
{
bounds *args = (bounds*)params;
int lowerBound = args->lowerBound;
int upperBound = args->upperBound;
int idx = args->idx;
int i, j, k;
for (i = lowerBound; i < upperBound; i++)
{
for (k = 0; k < N; k++)
{
for (j = 0; j < N; j+=4)
{
mmx1 = _mm_loadu_ps(&X[i][j]);
mmx2 = _mm_load_ps1(&A[i][k]);
mmx3 = _mm_loadu_ps(&B[k][j]);
mmx4 = _mm_mul_ps(mmx2, mmx3);
mmx0 = _mm_add_ps(mmx1, mmx4);
_mm_storeu_ps(&X[i][j], mmx0);
}
}
}
_endthread();
}
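The bounds type is not shown in the question; judging from the fields used above (lowerBound, upperBound, idx), it presumably looks something like this:
struct bounds
{
    int lowerBound;
    int upperBound;
    int idx;
};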
The following section computes the lower and upper bound for each thread:
bounds arg[CORES];
for (int part = 0; part < CORES; part++)
{
arg[part].idx = part;
arg[part].lowerBound = (N / CORES)*part;
arg[part].upperBound = (N / CORES)*(part + 1);
}
Finally, the threaded SIMD version is invoked like this:
HANDLE handle[CORES];
for (int part = 0; part < CORES; part++)
{
handle[part] = (HANDLE)_beginthread(threadedSIMDMatMul, 0, (void*)&arg[part]);
}
for (int part = 0; part < CORES; part++)
{
WaitForSingleObject(handle[part], INFINITE);
}
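As a side note, the wait loop above could equivalently be written as one call, assuming CORES does not exceed MAXIMUM_WAIT_OBJECTS (64):
WaitForMultipleObjects(CORES, handle, TRUE, INFINITE); // block until every worker thread has finished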
The results are as follows:
Test 1:
// arrays are defined as follows
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=1//just one thread
Sequential time: 11129 ms
Threaded SIMD matmul time: 14650 ms
Speedup = 0.75x
Test 2:
// arrays are defined as follows
float **A = (float**)_aligned_malloc(N* sizeof(float), 16);
float **B = (float**)_aligned_malloc(N* sizeof(float), 16);
float **X = (float**)_aligned_malloc(N* sizeof(float), 16);
for (int k = 0; k < N; k++)
{
A[k] = (float*)malloc(N * sizeof(float));
B[k] = (float*)malloc(N * sizeof(float));
X[k] = (float*)malloc(N * sizeof(float));
}
N=2048
Core=1//just one thread
Sequential time: 15907 ms
Threaded SIMD matmul time: 18578 ms
Speedup = 0.85x
Test 3:
// arrays are defined as follows
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=2
Sequential time: 10855 ms
Threaded SIMD matmul time: 27967 ms
Speedup = 0.38x
Test 4:
// arrays are defined as follows
float **A = (float**)_aligned_malloc(N* sizeof(float), 16);
float **B = (float**)_aligned_malloc(N* sizeof(float), 16);
float **X = (float**)_aligned_malloc(N* sizeof(float), 16);
for (int k = 0; k < N; k++)
{
A[k] = (float*)malloc(N * sizeof(float));
B[k] = (float*)malloc(N * sizeof(float));
X[k] = (float*)malloc(N * sizeof(float));
}
N=2048
Core=2
Sequential time: 16579 ms
Threaded SIMD matmul time: 30160 ms
Speedup = 0.51x
My question: why am I not getting a speedup?
Answer 0 (score: 5):
Here are the times I get building on your algorithm on my four-core i7 IVB processor:
sequential: 3.42 s
4 threads: 0.97 s
4 threads + SSE: 0.86 s
Here are the times on a 2-core P9600 @ 2.53 GHz, which is similar to the OP's E2200 @ 2.2 GHz:
sequential: 6.52 s
2 threads: 3.66 s
2 threads + SSE: 3.75 s
I used OpenMP because it makes this easy. Each thread in OpenMP effectively runs over
lowerBound = N*part/CORES;
upperBound = N*(part + 1)/CORES;
(Note that this is slightly different from your definition. Because you divide by CORES first, your definition can give the wrong result for some values of N: the rounding happens before the multiplication, so the last thread's upper bound can fall short of N.)
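For example, with N = 2048 and CORES = 3, your formula gives an upper bound of (2048/3)*3 = 682*3 = 2046 for the last thread, so rows 2046 and 2047 are never computed, whereas N*(part + 1)/CORES = 2048*3/3 = 2048 covers every row.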
As for the SIMD version: it's not much faster, probably because the kernel is memory-bandwidth bound, and also because GCC already vectorizes the scalar loop.
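If you want to confirm that GCC auto-vectorized the scalar loop, you can ask it to report its vectorization decisions (just a way to check; the timings above do not depend on it):
//c99 -O3 -fopenmp -fopt-info-vec -Wall foo.c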
The optimal solution is much more complicated. You need to use loop tiling and reorder the elements within tiles to get the best performance. I don't have time to do that today.
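For reference, a minimal sketch of what loop tiling over the same i/k/j ordering could look like is shown below; the block size BS is an arbitrary guess rather than a tuned value, and this sketch does not do the in-tile element reordering mentioned above:
#define BS 64 // untuned block size; assumes n is a multiple of BS (true for n = 2048)
void gemm_tiled(float * restrict a, float * restrict b, float * restrict c, int n) {
    for(int ii=0; ii<n; ii+=BS) {
        for(int kk=0; kk<n; kk+=BS) {
            for(int jj=0; jj<n; jj+=BS) {
                // update the BS x BS tile of c at (ii,jj) from tiles of a and b
                for(int i=ii; i<ii+BS; i++) {
                    for(int k=kk; k<kk+BS; k++) {
                        float aik = a[i*n+k];
                        for(int j=jj; j<jj+BS; j++) {
                            c[i*n+j] += aik*b[k*n+j];
                        }
                    }
                }
            }
        }
    }
}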
Here is the code I used:
//c99 -O3 -fopenmp -Wall foo.c
#include <stdio.h>
#include <string.h>
#include <x86intrin.h>
#include <omp.h>
void gemm(float * restrict a, float * restrict b, float * restrict c, int n) {
for(int i=0; i<n; i++) {
for(int k=0; k<n; k++) {
for(int j=0; j<n; j++) {
c[i*n+j] += a[i*n+k]*b[k*n+j];
}
}
}
}
void gemm_tlp(float * restrict a, float * restrict b, float * restrict c, int n) {
#pragma omp parallel for
for(int i=0; i<n; i++) {
for(int k=0; k<n; k++) {
for(int j=0; j<n; j++) {
c[i*n+j] += a[i*n+k]*b[k*n+j];
}
}
}
}
void gemm_tlp_simd(float * restrict a, float * restrict b, float * restrict c, int n) {
#pragma omp parallel for
for(int i=0; i<n; i++) {
for(int k=0; k<n; k++) {
__m128 a4 = _mm_set1_ps(a[i*n+k]);
for(int j=0; j<n; j+=4) {
__m128 c4 = _mm_load_ps(&c[i*n+j]);
__m128 b4 = _mm_load_ps(&b[k*n+j]);
c4 = _mm_add_ps(_mm_mul_ps(a4,b4),c4);
_mm_store_ps(&c[i*n+j], c4);
}
}
}
}
int main(void) {
int n = 2048;
float *a = _mm_malloc(n*n * sizeof *a, 64);
float *b = _mm_malloc(n*n * sizeof *b, 64);
float *c1 = _mm_malloc(n*n * sizeof *c1, 64);
float *c2 = _mm_malloc(n*n * sizeof *c2, 64);
float *c3 = _mm_malloc(n*n * sizeof *c3, 64);
for(int i=0; i<n*n; i++) a[i] = 1.0*i;
for(int i=0; i<n*n; i++) b[i] = 1.0*i;
memset(c1, 0, n*n * sizeof *c1);
memset(c2, 0, n*n * sizeof *c2);
memset(c3, 0, n*n * sizeof *c3);
double dtime;
dtime = -omp_get_wtime();
gemm(a,b,c1,n);
dtime += omp_get_wtime();
printf("time %f\n", dtime);
dtime = -omp_get_wtime();
gemm_tlp(a,b,c2,n);
dtime += omp_get_wtime();
printf("time %f\n", dtime);
dtime = -omp_get_wtime();
gemm_tlp_simd(a,b,c3,n);
dtime += omp_get_wtime();
printf("time %f\n", dtime);
printf("error %d\n", memcmp(c1,c2, n*n*sizeof *c1));
printf("error %d\n", memcmp(c1,c3, n*n*sizeof *c1));
}
Answer 1 (score: 1):
It looks to me like the threads are sharing the __m128 mmx* variables; you probably defined them as globals or statics. You must be getting wrong results in your X array too. Define the __m128 mmx* variables inside the threadedSIMDMatMul function scope and it will run much faster.
void threadedSIMDMatMul(void* params)
{
__m128 mmx0, mmx1, mmx2, mmx3, mmx4;
// rest of the code here
}
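For contrast, the problematic setup is presumably a file-scope declaration along these lines (an assumption, since the question does not show where the mmx variables are declared):
// shared by every thread: concurrent writes race (wrong results in X)
// and bounce the cache lines between cores (slow)
__m128 mmx0, mmx1, mmx2, mmx3, mmx4;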