共享内存矩阵乘法内核

时间:2012-12-28 23:47:17

标签: c cuda parallel-processing gpu shared-memory

我正在尝试实现一个基于共享内存的矩阵乘法内核,如CUDA C编程指南中所述。以下是内核:

 // Shared-memory (tiled) matrix multiply: each thread accumulates one value in
 // CValue from 16x16 tiles of A and B staged in shared memory, then stores it
 // through CSub. The tile edge 16 is hard-coded throughout the kernel.
 __global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     // CSub points at the first element of the 16x16 tile of C selected by (blockIdx.y, blockIdx.x).
     float * CSub = &C[CColumns * 16 * blockIdx.y + 16 * blockIdx.x];
     float CValue = 0;
 // One iteration per 16-column tile of A; the integer division drops any partial tile.
 for (int k = 0; k < (AColumns / 16); ++k) {
         float * ASub =  &A[AColumns * 16 * blockIdx.y + 16 * k];
         // NOTE(review): the B tile is offset by blockIdx.y and strided by AColumns;
         // for a row-major B one would expect blockIdx.x and BColumns here - confirm.
         float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
         __shared__ float As[16][16];
         __shared__ float Bs[16][16];
         // Each thread copies one element of the A tile and one element of the B tile.
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         // Barrier: both tiles must be fully written before any thread reads them.
         __syncthreads();
         for (int n = 0; n < 16; ++n)
        CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         // Barrier: every thread must finish reading before the tiles are overwritten.
         __syncthreads();
     }
     // NOTE(review): threadIdx.x selects the row and threadIdx.y the column here, which is
     // transposed relative to the As[threadIdx.y][..] / Bs[..][threadIdx.x] indexing above - confirm.
     CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
 }

以下是对内核的调用:

 // 16x16 threads per block.
 dim3 dimBlock(16, 16, 1);
 dim3 dimGrid;
 // Ceiling division: enough blocks to cover CColumns in x and CRows in y.
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 // deviceA/deviceB/deviceC are presumably device allocations made elsewhere - not shown in this snippet.
 matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

不幸的是,这似乎产生了错误的结果。

非常感谢任何协助/解释。

1 个答案:

答案 0 :(得分:5)

你的内核中至少有 2 个基本错误,两者都比较容易修正。你原来的代码是:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.y];

你应该用这个:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.x];

你有这个:

 CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;

你应该用这个:

 CSub[threadIdx.y*CColumns+threadIdx.x]=CValue;

这样修改之后,在以下条件下可以得到基本正确的结果:

  1. 矩阵为方阵
  2. 矩阵尺寸可以被分块(tile)尺寸整除

修复方阵限制并不困难。而要去掉“矩阵尺寸必须能被分块尺寸整除”的限制,则需要对内核进行较大的改动,以便:

  1. 不处理超出范围的元素
  2. 在“边界”区域用合适的值正确填充共享内存

由于您的代码完全没有考虑这些情况,我不确定您是否在询问这方面的问题,因此选择不专门处理它们。

我将您的代码改写成了下面这个可以正常运行的基本示例:(请注意,为了缩短代码,我省略了 CUDA 错误检查。请不要把它当作良好编码实践的范例,实际使用时请进行适当的错误检查。我的回答并不是要讲解如何做好 CUDA 错误检查,而是要给出一个算法正确的例子。)

      #include <stdio.h>
      #include <stdlib.h>
      #include <math.h>
      // Edge length of the square shared-memory tiles; main() also uses it as the thread-block width and height.
      #define TILE_DIM 16
      // Matrix extents used by main() for the host array sizes and the row/column counts passed to the kernels.
      #define DIMX 256
      #define DIMY 256
      // Absolute tolerance used by main() when comparing the CPU and GPU results.
      #define RES 0.1
      
      /*
       * Tiled matrix multiplication C = A * B for row-major matrices, staging
       * TILE_DIM x TILE_DIM tiles of A and B in shared memory.
       *
       * A is ARows x AColumns, B is BRows x BColumns, C is CRows x CColumns.
       * A meaningful product requires AColumns == BRows, CRows == ARows and
       * CColumns == BColumns; the kernel does not verify this, it only makes
       * sure it never reads or writes outside the extents it was given.
       *
       * Launch requirements:
       *   - blockDim must be (TILE_DIM, TILE_DIM, 1);
       *   - gridDim.x * TILE_DIM >= CColumns and gridDim.y * TILE_DIM >= CRows.
       * Static shared memory: two TILE_DIM x TILE_DIM float tiles per block.
       *
       * The dimensions need not be square or multiples of TILE_DIM: tile
       * entries that fall outside A or B are loaded as 0, which leaves the
       * accumulated dot product unchanged.
       */
      __global__ void matrixMultiplyShared(float * A, float * B, float * C,
                           int ARows, int AColumns,
                           int BRows, int BColumns,
                           int CRows, int CColumns) {
           __shared__ float As[TILE_DIM][TILE_DIM];
           __shared__ float Bs[TILE_DIM][TILE_DIM];

           // Element of C owned by this thread (may lie outside C in edge blocks).
           int Row = blockIdx.y * TILE_DIM + threadIdx.y;
           int Col = blockIdx.x * TILE_DIM + threadIdx.x;
           float CValue = 0.0f;

           // Ceiling division so a trailing partial tile along the shared dimension is not dropped.
           int numTiles = (AColumns + TILE_DIM - 1) / TILE_DIM;
           for (int k = 0; k < numTiles; ++k) {
             int ACol = k * TILE_DIM + threadIdx.x;   // column of A loaded by this thread
             int BRow = k * TILE_DIM + threadIdx.y;   // row of B loaded by this thread

             // Out-of-range tile entries are zero-filled instead of read.
             As[threadIdx.y][threadIdx.x] =
                 (Row < ARows && ACol < AColumns) ? A[Row * AColumns + ACol] : 0.0f;
             // B is indexed with its own row stride (BColumns), not AColumns.
             Bs[threadIdx.y][threadIdx.x] =
                 (BRow < BRows && Col < BColumns) ? B[BRow * BColumns + Col] : 0.0f;

             // Barriers are outside any per-thread bounds check, so every thread
             // of the block reaches them even in partially covered edge blocks.
             __syncthreads();
             for (int n = 0; n < TILE_DIM; ++n)
               CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
             __syncthreads();
           }

           // Only threads that map to a real element of C store a result.
           if (Row < CRows && Col < CColumns)
             C[Row * CColumns + Col] = CValue;
       }
      
      
      /*
       * Host reference implementation of C = A * B for row-major matrices.
       *
       * Iterates over ARows rows and BColumns columns; every output element is
       * the sum over AColumns products, accumulated in float in increasing
       * order of the shared index. A is indexed with stride AColumns, B with
       * stride BColumns and C with stride CColumns. BRows and CRows are
       * accepted for signature symmetry with the GPU kernel but are not read.
       */
      void matrixMultiplyCPU(float * A, float * B, float * C,
                           int ARows, int AColumns,
                           int BRows, int BColumns,
                           int CRows, int CColumns) {
        int row = 0;
        while (row < ARows) {
          const int aBase = row * AColumns;   // offset of this row in A
          const int cBase = row * CColumns;   // offset of this row in C
          int col = 0;
          while (col < BColumns) {
            float dot = 0.0;
            for (int inner = 0; inner < AColumns; ++inner) {
              dot += A[aBase + inner] * B[inner * BColumns + col];
            }
            C[cBase + col] = dot;
            ++col;
          }
          ++row;
        }
      }
      /*
       * Test driver: fills two DIMX x DIMY matrices with random values in
       * [0, 1], multiplies them on the GPU with matrixMultiplyShared and on the
       * CPU with matrixMultiplyCPU, and compares the results element by element
       * with absolute tolerance RES.
       *
       * Returns 0 and prints "Finished!" when all elements agree; prints the
       * first mismatch and returns 1 otherwise.
       *
       * The row/column assignments below only describe a valid product when
       * DIMX == DIMY (AColumns must equal BRows).
       *
       * CUDA API return codes are intentionally not checked to keep the
       * example short; real code should check every call.
       */
      int main(){
       int CColumns = DIMY, CRows=DIMX, AColumns=DIMY, ARows=DIMX, BColumns=DIMY, BRows=DIMX;
       dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
       dim3 dimGrid;
       // Ceiling division so the grid covers every element of C.
       dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
       dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
       float *deviceA, *deviceB, *deviceC;
       // Static storage instead of automatic: at 256x256 these four arrays
       // total 1 MiB, which can overflow small default thread stacks.
       static float hostA[DIMY][DIMX];
       static float hostB[DIMY][DIMX];
       static float hostC[DIMY][DIMX];    // GPU result
       static float hostCp[DIMY][DIMX];   // CPU reference result
       for (int x = 0; x<DIMX; x++)
         for (int y = 0; y<DIMY; y++) {
           hostA[y][x] = rand()/(float)RAND_MAX;
           hostB[y][x] = rand()/(float)RAND_MAX;
           }
        cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
        cudaMalloc((void **)&deviceB, DIMX*DIMY*sizeof(float));
        cudaMalloc((void **)&deviceC, DIMX*DIMY*sizeof(float));
        cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(deviceB, hostB, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
        matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
        // Blocking copy on the default stream: waits for the kernel to finish first.
        cudaMemcpy(hostC, deviceC, DIMX*DIMY*sizeof(float), cudaMemcpyDeviceToHost);
        matrixMultiplyCPU(&(hostA[0][0]) , &(hostB[0][0]) , &(hostCp[0][0]) , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

       // Stop at the first mismatch, but fall through to the cleanup below
       // instead of returning with the device buffers still allocated.
       int status = 0;
       for (int y = 0; y<DIMY && status == 0; y++)
         for (int x = 0; x<DIMX && status == 0; x++)
           if (fabs(hostCp[y][x] - hostC[y][x]) > RES)
             {
             printf("Error at offset y=%d,x=%d, CPU = %f, GPU = %f\n", y, x, hostCp[y][x], hostC[y][x]);
             status = 1;
             }

       cudaFree(deviceA);
       cudaFree(deviceB);
       cudaFree(deviceC);

       if (status == 0)
         printf("Finished!\n");
       return status;
      }