Question

我最近开始开始使用CUDA处理GPU。作为入门程序，我试图有效地实现简单的矩阵乘法

C = AB

，从朴素矩阵乘法（每个线程为C中的一个元素加载A和B的所有元素）开始，分块实现（线程将共享存储器中一个分块中的A和B的元素分块协作加载到减少全局内存流量）提供了良好的速度。但是，在分片实现中，对全局内存的访问也不是合并的顺序。因此，为了提高性能，最好转置矩阵B然后相乘。下面是我的代码，

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

#include <time.h>

#include <sys/time.h>


void querydeviceprop();
void allocate_matrix(float *h_a, float *h_b, int matDim);
void verify(float *h_c, float *h_c_check, int matDim);
void print_matrix(float *ha, int m,int n);
void transpose_matrix(float *ha, int matDim);

void mat_mul();

#define TILE_WIDTH 16 //should be equal to numThread for tiling implementation

__global__ void MatrixMult_tiling(float *d_a,float *d_b,float *d_c, int dim){

    __shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
    __shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
    int bx,by,tx,ty,i,j;
    float res;
    int row, col;

    bx=blockIdx.x;  by=blockIdx.y;
    tx=threadIdx.x; ty=threadIdx.y;

    row=by*TILE_WIDTH+ty;
    col=bx*TILE_WIDTH+tx;

    res=0;
    for(i=0;i<dim/TILE_WIDTH;i++){
        //collaboratively load the elements. Each thread loads a single element.
        ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
        tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];

        __syncthreads();
        for(j=0;j<TILE_WIDTH;j++){

            res=res+ta[ty][j]*tb[j][tx];
        }
        __syncthreads();
    }
    d_c[row*dim+col]=res;
}

__global__ void MatrixMult_tiling_coalesced(float *d_a,float *d_b,float *d_c, int dim){

    __shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
    __shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
    int bx,by,tx,ty,i,j;
    float res;
    int row, col;

    bx=blockIdx.x;  by=blockIdx.y;
    tx=threadIdx.x; ty=threadIdx.y;

    row=by*TILE_WIDTH+ty;
    col=bx*TILE_WIDTH+tx;

    res=0;
    for(i=0;i<dim/TILE_WIDTH;i++){
        //collaboratively load the elements. Each thread loads a single element.
        ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
        tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
        __syncthreads();

        for(j=0;j<TILE_WIDTH;j++){
            res=res+ta[ty][j]*tb[tx][j];
        }
        __syncthreads();
    }
    d_c[row*dim+col]=res;
}

__global__ void MatrixMult_naive(float *d_a,float *d_b,float *d_c, int dim){

    int row,col,i;

    col=blockIdx.y*blockDim.y+threadIdx.y;
    row=blockIdx.x*blockDim.x+threadIdx.x;

    float res;

    if(row<dim && col<dim){
        res=0;
        for(i=0;i<dim;i++){
            res=res+(d_a[row*dim+i]*d_b[i*dim+col]);
        }
        d_c[row*dim+col]=res;
    }
}



int main(){
    mat_mul();
return 0;
}

void mat_mul(){

    cudaSetDevice(0);

    time_t t;
    cudaError_t err = cudaSuccess;
    srand((unsigned) time(&t));

    cudaEvent_t start, stop;
    float milliseconds=0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);


    int matDim = 8192;

    float *h_a, *h_b, *h_c, *h_c_check;

    /*declare the host memories*/
    h_a=(float *)malloc(matDim*matDim*sizeof(float));
    h_b=(float *)malloc(matDim*matDim*sizeof(float));
    h_c=(float *)malloc(matDim*matDim*sizeof(float));
    h_c_check=(float *)malloc(matDim*matDim*sizeof(float));

    // Verify that allocations succeeded
    if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_check ==NULL)
    {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
    }

    allocate_matrix(h_a,h_b,matDim); // allocate memory to hold the matrix

    //allocate cuda memory
        float *d_a=NULL;
        float *d_b=NULL;
        float *d_c=NULL;

        err=cudaMalloc((void **)&d_a, matDim*matDim*sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
        err=cudaMalloc((void **)&d_b, matDim*matDim*sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
        err=cudaMalloc((void **)&d_c, matDim*matDim*sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Matrix dimension is : %d\n",matDim);

    // Copy the host input matrix A and B in host memory to the device matrix in device memory
    //printf("Copy input data from the host memory to the CUDA device\n");

    cudaEventRecord(start);
    err = cudaMemcpy(d_a, h_a, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    //printf("GPU memcpy matrix A %10.10f ms\n",milliseconds);


    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaEventRecord(start);
    err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    //printf("GPU memcpy matrix B %10.10f ms\n",milliseconds);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }


    /*constants for kernel launch*/
    int numThread=16; //number of threads per Block axis
    int numBlocks=matDim/numThread;
    if(matDim%numThread)
        numBlocks++;
    dim3 dimGrid(numBlocks,numBlocks);
    dim3 dimBlock(numThread,numThread);

    //-------------------------------------------------------------
           //-------transpose and copy to GPU-------
           transpose_matrix(h_b, matDim);//transpose first the b matrix
           err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
           cudaEventSynchronize(stop);
           if (err != cudaSuccess){
                   fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
                   exit(EXIT_FAILURE);
               }
           //--------transpose and copy ends-------------

           cudaEventRecord(start);
           MatrixMult_tiling_coalesced<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
           cudaEventRecord(stop);
           err = cudaGetLastError();

           if (err != cudaSuccess){
               fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
               exit(EXIT_FAILURE);
           }

           cudaEventSynchronize(stop);
           cudaEventElapsedTime(&milliseconds, start, stop);
           printf("GPU time tiled & coalesced %10.10f ms\n",milliseconds);

           //printf("Copy output data from the CUDA device to the host memory\n");
           cudaEventRecord(start);
           err = cudaMemcpy(h_c_check, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
           cudaEventRecord(stop);
           cudaEventSynchronize(stop);
           cudaEventElapsedTime(&milliseconds, start, stop);
           //printf("GPU memcpy time tiled & coalesced %10.10f ms\n",milliseconds);

           //------------transpose back the original B matrix----------------
           transpose_matrix(h_b, matDim);//transpose first the b matrix
           err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
           cudaEventSynchronize(stop);
           if (err != cudaSuccess){
                   fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
                   exit(EXIT_FAILURE);
               }

           //------------transpose back the original matrix ends-------------
//-------------------------------------------------------------

    cudaEventRecord(start);
    MatrixMult_tiling<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
    cudaEventRecord(stop);
    err = cudaGetLastError();

     if (err != cudaSuccess)
     {
         fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }

     cudaEventSynchronize(stop);
     cudaEventElapsedTime(&milliseconds, start, stop);
     printf("GPU time tiled %10.10f ms\n",milliseconds);

     //printf("Copy output data from the CUDA device to the host memory\n");
     cudaEventRecord(start);
     err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
     cudaEventRecord(stop);
     cudaEventSynchronize(stop);
     cudaEventElapsedTime(&milliseconds, start, stop);
     //printf("GPU memcpy time tiled %10.10f ms\n",milliseconds);


//-------------------------------------------------------------

    /*
    cudaEventRecord(start);
    MatrixMult_naive<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
    cudaEventRecord(stop);
    err = cudaGetLastError();

     if (err != cudaSuccess)
     {
         fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }

     cudaEventSynchronize(stop);
     cudaEventElapsedTime(&milliseconds, start, stop);
     printf("GPU time naive %10.10f ms\n",milliseconds);

     printf("Copy output data from the CUDA device to the host memory\n");
     cudaEventRecord(start);
     err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
     cudaEventRecord(stop);
     cudaEventSynchronize(stop);
     cudaEventElapsedTime(&milliseconds, start, stop);
     printf("GPU memcpy time naive %10.10f ms\n",milliseconds);
    */
//-------------------------------------------------------------

     verify(h_c, h_c_check, matDim);

     // Free device global memory
     err = cudaFree(d_a);

     if (err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }

     err = cudaFree(d_b);

     if (err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }

     err = cudaFree(d_c);

     if (err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }

     // Free host memory
     free(h_a);
     free(h_b);
     free(h_c);

     printf("Done\n");

}

void allocate_matrix(float *h_a, float *h_b, int matDim){

    int i,j;
    // Initialize the host input vectors
    for (i = 0; i < matDim; i++)
    {
        for(j=0;j< matDim;j++){
            h_a[i*matDim+j] = rand()%10;
            h_b[i*matDim+j] = rand()%10;
        }
    }

}

void print_matrix(float *ha, int m,int n){

    int i, j;
    for(i=0;i<m;i++){
        for(j=0;j<n;j++){
            printf("  %.1f ",ha[i*m+j]);
        }
    printf("\n");
    }
}

void transpose_matrix(float *h_a, int matDim){

    int i, j;
    int temp;
    for(i=0;i<matDim;i++)
    {
    for(j=0;j<i;j++)
    {
        temp=h_a[i*matDim+j];
        h_a[i*matDim+j]=h_a[j*matDim+i];
        h_a[j*matDim+i]=temp;
    }
    }
}


void verify(float *h_c, float *h_c_check, int matDim){

    int i,j;
    //check the code
     for (i = 0; i < matDim; i++)
     {
         for(j=0;j<matDim;j++){
         if (fabs(h_c[i*matDim+j] - h_c_check[i*matDim+j]) > 1e-5)
         {
             printf("cpu : %f , gpu : %f\t",h_c[i*matDim+j],h_c_check[i*matDim+j]);
             fprintf(stderr, "Result verification failed at element %d,%d !\n\n", i,j);
             exit(EXIT_FAILURE);
         }

         }
     }

     printf("Test PASSED\n");

}

MatrixMult_tiling_coalesced和void MatrixMult_tiling分别是对B元素进行无意内存访问的函数。

现在，问题是MatrixMult_tiling_coalesced花费的时间几乎是MatrixMult_tiling花费的时间的两倍。我了解到，在MatrixMult_tiling中，元素以合并的方式（即，按行主要顺序）加载到图块中，但每个图块均沿列排列，而MatrixMult_tiling_coalesced中的图块则沿列排列沿行排列，因此功能MatrixMult_tiling_coalesced应该比另一个更快。但实际上，我可以看到相反的说法。如果有人能指出原因，我将不胜感激。提前致意。

编辑1：在回答了罗伯特（见下文）之后，我了解到问题出在elemntwise乘法期间的加载操作中。

tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];]

到

tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];

和

res=res+ta[ty][j]*tb[tx][j];

到

res=res+ta[ty][j]*tb[j][tx];

MatrixMult_tiling_coalesced函数的性能从1500毫秒提高到1000毫秒。但是，功能MatrixMult_tiling仅花费879毫秒。因此，合并的例程仍然较慢。我不明白问题出在哪里。

编辑2：我意识到在EDIT 1中，我刚将库冲突问题从逐元素乘法转移到了元素加载部分。代码中的更改没有银行冲突，

    tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];

到

    tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];

和

res=res+ta[ty][j]*tb[j][tx];

到

res=res+ta[ty][j]*tb[ty][j];

但是它仍然比MatrixMult_tiling函数慢一点。 MatrixMult_tiling_coalesced函数需要982毫秒，而MatrixMult_tiling函数则需要870毫秒。如果速度不快，则至少应类似于MatrixMult_tiling。

最终编辑：

编辑2将不会产生正确的结果。因此，带有编辑1的代码将是最佳的。转置被乘矩阵之一可能不是一个好主意。：-（

谢谢大家的帮助。

Answer 1

B当然不是我要在C=AB中转置的矩阵。但这既不是这里也不是那里。

我不确定你为什么会这样想：

在分片实现中，对全局内存的访问也不是合并的顺序

我在您的MatrixMult_tiling中看不到任何代码行会导致未经许可的访问。

为了确保我们不会超出术语范围，“成组”或“不成类”是我们应用于全局内存（非共享内存）访问模式的术语。您的全局内存访问模式在平铺内核的以下几行中：

    ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
    tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
    ...
    d_c[row*dim+col]=res;

，并且不会对全局存储器的所有这些模式进行提示。在生成的d_a，d_b和d_c的每个索引中，如果执行替换，则会发现threadIdx.x变量存在于所有变量中，并且不乘以任何值，常数或其他。因此，这些模式都将融合在一起（很好）。

如果有人能指出原因，我将不胜感激。

在共享内存方面，您做得不好。

在切片内核中，乘法运算如下：

        res=res+ta[ty][j]*tb[j][tx];

在这种情况下：

   ta[ty][j]

我们遇到这样一种情况，即扭曲中的所有线程（其线性增长的tx值，但具有相同的ty值）正在共享内存中读取相同的位置。这是一种“最佳”访问模式-它不会引起任何银行冲突，将在最短的时间内提供服务。

在这种情况下：

  tb[j][tx]

在这种情况下，warp中的相邻线程正在读取共享内存中的相邻位置。这也是一种“最佳”，无银行冲突的模式，将在最短的时间内提供服务。

但是在您的MatrixMult_tiling_coalesced内核中，相应的乘法运算是：

  res=res+ta[ty][j]*tb[tx][j];

同样，在这种情况下：

  ta[ty][j]

我们有一个共享内存“广播”模式（经线中的所有线程都从同一位置读取），这是最佳且快速的。但是在这种情况下：

  tb[tx][j]

您实际上已经创建了对共享内存的列访问权限。对于共享内存，这是最糟糕的访问模式，它将导致加载过程进行32路串行化（如果是16x16线程块，则可能是16路串行化），并且性能肯定会变差。为什么？请记住，对于给定的载荷，j在整个经纱上是恒定的，tx在整个经纱上呈线性增加。因此，假设j在特定的循环迭代中为1。经线0中的线程将显示为：

tb[0][1], tb[1][1], tb[2][1], tb[3][1], ...

，并且这些位置都属于共享内存的特定“列”，即它们都属于同一共享内存库。这是共享内存的最坏情况。

为完整起见，我声称MatrixMult_tiling_coalesced内核中的所有全局内存访问模式也已合并：

    ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
    tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
    ...
    d_c[row*dim+col]=res;

因此，您的两个内核实现之间的全局内存访问模式/活动/效率应该没有重大差异。

作为旁注，我认为这都是学习活动。如果您对GPU上的高性能矩阵乘法感兴趣，我建议您考虑使用CUBLAS。

矩阵乘法：在CUDA中合并全局内存访问后，性能会降低

1 个答案: