Question

我正在尝试加速多核架构上的矩阵乘法。为此，我尝试同时使用线程和SIMD。但我的结果并不好。我通过顺序矩阵乘法测试加速：

void sequentialMatMul(void* params)
{
    cout << "SequentialMatMul started.";
    int i, j, k;
    for (i = 0; i < N; i++)
    {
        for (k = 0; k < N; k++)
        {
            for (j = 0; j < N; j++)
            {
                X[i][j] += A[i][k] * B[k][j];
            }
        }
    }
    cout << "\nSequentialMatMul finished.";
}

我尝试将线程和SIMD添加到矩阵乘法中，如下所示：

void threadedSIMDMatMul(void* params)
{
    bounds *args = (bounds*)params;
    int lowerBound = args->lowerBound;
    int upperBound = args->upperBound;
    int idx = args->idx;

    int i, j, k;
    for (i = lowerBound; i <upperBound; i++)
    {
        for (k = 0; k < N; k++)
        {
            for (j = 0; j < N; j+=4)
            {
                mmx1 = _mm_loadu_ps(&X[i][j]);
                mmx2 = _mm_load_ps1(&A[i][k]);
                mmx3 = _mm_loadu_ps(&B[k][j]);
                mmx4 = _mm_mul_ps(mmx2, mmx3);
                mmx0 = _mm_add_ps(mmx1, mmx4);
                _mm_storeu_ps(&X[i][j], mmx0);
            }
        }
    }
    _endthread();
}

以下部分用于计算每个线程的下限和上限：

bounds arg[CORES];
for (int part = 0; part < CORES; part++)
{
    arg[part].idx = part;
    arg[part].lowerBound = (N / CORES)*part;
    arg[part].upperBound = (N / CORES)*(part + 1);
}

最后，线程SIMD版本就像这样调用：

HANDLE  handle[CORES];      
for (int part = 0; part < CORES; part++)
{
    handle[part] = (HANDLE)_beginthread(threadedSIMDMatMul, 0, (void*)&arg[part]);
}
for (int part = 0; part < CORES; part++)
{
WaitForSingleObject(handle[part], INFINITE);
}

结果如下：测试1：

// arrays are defined as follow
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=1//just one thread

连续时间：11129ms

螺纹SIMD matmul时间：14650ms

加速= 0.75x

测试2：

//defined arrays as follow
float **A = (float**)_aligned_malloc(N* sizeof(float), 16);
float **B = (float**)_aligned_malloc(N* sizeof(float), 16);
float **X = (float**)_aligned_malloc(N* sizeof(float), 16);
for (int k = 0; k < N; k++)
{
    A[k] = (float*)malloc(cols * sizeof(float));
    B[k] = (float*)malloc(cols * sizeof(float));
    X[k] = (float*)malloc(cols * sizeof(float));
}
N=2048
Core=1//just one thread

连续时间：15907毫秒

螺纹SIMD matmul时间：18578ms

加速= 0.85x

测试3：

//defined arrays as follow
float A[N][N];
float B[N][N];
float X[N][N];
N=2048
Core=2

连续时间：10855ms

螺纹SIMD matmul时间：27967ms

加速= 0.38x

测试4：

//defined arrays as follow
float **A = (float**)_aligned_malloc(N* sizeof(float), 16);
float **B = (float**)_aligned_malloc(N* sizeof(float), 16);
float **X = (float**)_aligned_malloc(N* sizeof(float), 16);
for (int k = 0; k < N; k++)
{
    A[k] = (float*)malloc(cols * sizeof(float));
    B[k] = (float*)malloc(cols * sizeof(float));
    X[k] = (float*)malloc(cols * sizeof(float));
}
N=2048
Core=2

连续时间：16579毫秒

螺纹SIMD matmul时间：30160ms

加速= 0.51x

我的问题：为什么我没有加快速度？

Answer 1

Here are the times I get building on your algorithm on my four core i7 IVB processor.

sequential:         3.42 s
4 threads:          0.97 s
4 threads + SSE:    0.86 s

Here are the times on a 2 core P9600 @2.53 GHz which is similar to the OP's E2200 @2.2 GHz

sequential: time    6.52 s
2 threads: time     3.66 s
2 threads + SSE:    3.75 s

I used OpenMP because it makes this easy. Each thread in OpenMP runs over effectively

lowerBound = N*part/CORES;
upperBound = N*(part + 1)/CORES;

(note that that is slightly different than your definition. Your definition can give the wrong result due to rounding for some values of N since you divide by CORES first.)

As to the SIMD version. ~~It's not much faster probably due it being memory bandwidth bound~~ . It's probably not really faster because GCC already vectroizes the loop.

The most optimal solution is much more complicated. You need to use loop tiling and reorder the elements within tiles to get the optimal performance. I don't have time to do that today.

Here is the code I used:

//c99 -O3 -fopenmp -Wall foo.c
#include <stdio.h>
#include <string.h>
#include <x86intrin.h>
#include <omp.h>

void gemm(float * restrict a, float * restrict b, float * restrict c, int n) {
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            for(int j=0; j<n; j++) {
                c[i*n+j] += a[i*n+k]*b[k*n+j];
            }
        }
    }
}

void gemm_tlp(float * restrict a, float * restrict b, float * restrict c, int n) {
    #pragma omp parallel for
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            for(int j=0; j<n; j++) {
                c[i*n+j] += a[i*n+k]*b[k*n+j];
            }
        }
    }
}   

void gemm_tlp_simd(float * restrict a, float * restrict b, float * restrict c, int n) {
    #pragma omp parallel for
    for(int i=0; i<n; i++) {
        for(int k=0; k<n; k++) {
            __m128 a4 = _mm_set1_ps(a[i*n+k]);
            for(int j=0; j<n; j+=4) {
                __m128 c4 = _mm_load_ps(&c[i*n+j]);
                __m128 b4 = _mm_load_ps(&b[k*n+j]);
                c4 = _mm_add_ps(_mm_mul_ps(a4,b4),c4);
                _mm_store_ps(&c[i*n+j], c4);
            }
        }
    }
}

int main(void) {
    int n = 2048;
    float *a = _mm_malloc(n*n * sizeof *a, 64);
    float *b = _mm_malloc(n*n * sizeof *b, 64);
    float *c1 = _mm_malloc(n*n * sizeof *c1, 64);
    float *c2 = _mm_malloc(n*n * sizeof *c2, 64);
    float *c3 = _mm_malloc(n*n * sizeof *c2, 64);
    for(int i=0; i<n*n; i++) a[i] = 1.0*i;
    for(int i=0; i<n*n; i++) b[i] = 1.0*i;
    memset(c1, 0, n*n * sizeof *c1);
    memset(c2, 0, n*n * sizeof *c2);
    memset(c3, 0, n*n * sizeof *c3);
    double dtime;

    dtime = -omp_get_wtime();
    gemm(a,b,c1,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);

    dtime = -omp_get_wtime();
    gemm_tlp(a,b,c2,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);

    dtime = -omp_get_wtime();
    gemm_tlp_simd(a,b,c3,n);
    dtime += omp_get_wtime();
    printf("time %f\n", dtime);
    printf("error %d\n", memcmp(c1,c2, n*n*sizeof *c1));
    printf("error %d\n", memcmp(c1,c3, n*n*sizeof *c1));
}

Answer 2

It looks to me that the threads are sharing __m128 mmx* variables, you probably defined them global/static. You must be getting wrong results in your X array too. Define __m128 mmx* variables inside threadedSIMDMatMul function scope and it will run much faster.

void threadedSIMDMatMul(void* params)
{
    __m128 mmx0, mmx1, mmx2, mmx3, mmx4;
    // rest of the code here
}

通过线程和SIMD并行化矩阵乘法

2 个答案: