Can't get matrix*vector multiplication to run faster in CUDA than on the CPU

Date: 2016-04-25 22:41:36

Tags: cuda

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#include <cstdlib>   // rand, srand
#include <ctime>     // time

#define BLOCK_SIZE 32 // CUDA block size

__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
    return 0;
}

__device__ inline int getValFromVector(int* vector, int row, int matSize) {
    if (row<matSize) {return vector[row];}
    return 0;
}

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];
    __shared__ int myRow;
    __shared__ double rowSum;

    int myIndexInBlock = threadIdx.x;
    myRow = blockIdx.x;
    rowSum = 0;

    for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
        aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
        bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);

        __syncthreads(); // Sync threads to make sure all fields of aRowShared and bShared have been written by all threads in the block

        if (myIndexInBlock==0) {
            for (int k=0;k<BLOCK_SIZE;k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
        __syncthreads(); // Sync again so the shared buffers are not overwritten in the next iteration while thread 0 is still summing
    }

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}

static inline void cudaCheckReturn(cudaError_t result) {
    if (result != cudaSuccess) {
        std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
        assert(result == cudaSuccess);
    }
}

static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row

    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}


static void matVecMult(int** A,int* b, int* c, int matSize) {
    // Sequential implementation:
    for (int i=0;i<matSize;i++) {
        c[i]=0;
        for (int j=0;j<matSize;j++) {
            c[i]+=(A[i][j] * b[j]);
        }
    }
}

int main() {
    int matSize = 1000;

    int** A,* b,* c;
    int* aOnGPU,* bOnGPU,* cOnGPU;

    A = new int*[matSize];
    for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
    b = new int[matSize]();
    c = new int[matSize]();

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);

    cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
    cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
    cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));

    srand(time(NULL));

    for (int i=0;i<matSize;i++) {
        b[i] = rand()%100;
        for (int j=0;j<matSize;j++) {
            A[i][j] = rand()%100;
        }
    }

    for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
    cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));

    int iters=1;
    timeval start,end;

    // Sequential run:
    gettimeofday(&start,NULL);
    for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    // CUDA run:
    gettimeofday(&start,NULL);
    for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    cudaCheckReturn(cudaFree(aOnGPU));
    cudaCheckReturn(cudaFree(bOnGPU));
    cudaCheckReturn(cudaFree(cOnGPU));


    for (int i = 0; i < matSize; ++i) {
        delete[] A[i];
    }
    delete[] A;
    delete[] b;
    delete[] c;
}

This gives:

267171
580253

I have followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do matrix multiplication. I use shared memory for both the matrix (A) and the vector (B), but no matter which matrix size (100*100 up to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always beats the CUDA implementation, running roughly twice as fast.

Since I am doing matrix*vector multiplication, the shared arrays and blocks are handled a bit differently: instead of a 2D grid of blocks over tiles of the matrix, I use one block per matrix row.

Is my implementation wrong, or is CUDA simply not faster than the CPU here?

1 Answer:

Answer 0 (score: 3)

First: you perform bounds checks (the if statements in getValFromMatrix and getValFromVector) in your CUDA implementation but not in the CPU one. Branching is very expensive on a GPU.
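For illustration, one way to get rid of those per-element checks is to zero-pad the matrix rows and the vector on the host to a multiple of BLOCK_SIZE, so every load is in range. A minimal sketch under that padding assumption (the kernel name matVecMultCUDAKernelPadded and the padding scheme are mine, not part of the original code):

// Assumes each row of aOnGPU has stride paddedSize (a multiple of BLOCK_SIZE)
// and that bOnGPU is zero-padded to paddedSize, so no bounds checks are needed.
__global__ void matVecMultCUDAKernelPadded(int* aOnGPU, int* bOnGPU, int* cOnGPU, int paddedSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];

    int myIndexInBlock = threadIdx.x;
    int myRow = blockIdx.x;
    int rowSum = 0; // per-thread register; only thread 0's copy is used

    for (int m = 0; m < paddedSize / BLOCK_SIZE; m++) {
        // Unconditional loads: the padding guarantees these indices are in range
        aRowShared[myIndexInBlock] = aOnGPU[myRow*paddedSize + m*BLOCK_SIZE + myIndexInBlock];
        bShared[myIndexInBlock]    = bOnGPU[m*BLOCK_SIZE + myIndexInBlock];
        __syncthreads();

        if (myIndexInBlock==0) {
            for (int k=0;k<BLOCK_SIZE;k++) {rowSum += aRowShared[k] * bShared[k];}
        }
        __syncthreads(); // don't overwrite the shared buffers while thread 0 sums
    }

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}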

Second: you are counting the cudaMemcpy in your CUDA timing. Doing only a single multiplication before transferring the result back to the CPU is very unusual. Normally (e.g. in conjugate-gradient solvers), you perform several hundred multiplications on the GPU before copying anything back.
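For example, here is a sketch of that measurement style, timing many back-to-back kernel launches with CUDA events and copying the result back only once (the iteration count of 500 is arbitrary):

cudaEvent_t startEvt, stopEvt;
cudaCheckReturn(cudaEventCreate(&startEvt));
cudaCheckReturn(cudaEventCreate(&stopEvt));

cudaCheckReturn(cudaEventRecord(startEvt));
for (int i = 0; i < 500; i++) { // many multiplications before any copy-back
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize);
}
cudaCheckReturn(cudaEventRecord(stopEvt));
cudaCheckReturn(cudaEventSynchronize(stopEvt)); // wait for all launches to finish

float ms = 0.0f;
cudaCheckReturn(cudaEventElapsedTime(&ms, startEvt, stopEvt)); // kernel-only time
std::cout << "GPU time per multiplication: " << ms/500 << " ms" << std::endl;

cudaCheckReturn(cudaMemcpy(c,cOnGPU,bcSizeOnGPU,cudaMemcpyDeviceToHost)); // copy once, at the end

cudaCheckReturn(cudaEventDestroy(startEvt));
cudaCheckReturn(cudaEventDestroy(stopEvt));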

Third: don't implement this yourself (except for educational purposes); use a vendor library such as cuBLAS, which ships with every CUDA release. Those libraries are extremely hard to outperform.
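A minimal cuBLAS sketch of the same operation, as an illustration rather than a drop-in replacement: cuBLAS has no integer gemv, so this assumes the data is stored as float, and since cuBLAS expects column-major storage, the row-major matrix is handled by passing CUBLAS_OP_T (the helper name matVecMultCublas is mine). Compile with -lcublas.

#include <cublas_v2.h>

// y = A*x on the GPU. aOnGPU holds a row-major matSize x matSize float matrix;
// read column-major it is A^T, so CUBLAS_OP_T gives A back.
static void matVecMultCublas(cublasHandle_t handle, float* aOnGPU,
                             float* xOnGPU, float* yOnGPU, int matSize) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemv(handle, CUBLAS_OP_T,
                matSize, matSize,        // dimensions of the column-major view
                &alpha, aOnGPU, matSize, // lda = matSize
                xOnGPU, 1,               // x, stride 1
                &beta, yOnGPU, 1);       // y = 1*A*x + 0*y
}

Create the handle once with cublasCreate(&handle) before timing and release it with cublasDestroy(handle) afterwards; handle creation is expensive and should not sit inside the measured loop.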