Question

我正在尝试使用CUDA实现矩阵乘法。我有两个矩阵M w和w N.我在每个块中启动了（w * w）线程，并且网格维度=（M / w，N / w）。我在大小为32 * 32的共享内存中创建了一个矩阵。我想在共享内存中只使用一个矩阵来实现矩阵乘法。这是我的代码

#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
#include<stdlib.h>
#include<unistd.h>
#include<math.h>

__global__ void add(int *a,int *b, int *c,int *p,int *q){

    // __shared__ int aTile[*p][*p];
    //const int a=*p;
    __shared__ int aTile[32][32];

    int row = blockIdx.x*blockDim.x+threadIdx.x;
    int col = blockIdx.y*blockDim.y+threadIdx.y;
    int sum=0;
    aTile[threadIdx.x][threadIdx.y] = a[row*(*p)+threadIdx.y];

    __syncthreads();
    if(row< *q && col< *q) 
    {
        for(int k=0;k<*p;k++)
        {
            sum+= aTile[threadIdx.x][k]*b[col+(*q)*k];
            // __syncthreads();
        }

        c[col+(*q)*row]=sum;
        //__syncthreads();
    }
}


int main(){
    printf("Enter the number of rows of matrix 1\n");
    int row_1;
    scanf("%d",&row_1);
    printf("Enter the number of columns of matrix 1\n");
    int col_1; 
    scanf("%d",&col_1);
    /*printf("Enter the values of matrix 1 \n");
     */
    int a[row_1][col_1];
    for(int i=0;i<row_1;i++)
    {
        for(int j=0;j<col_1;j++)
        {
            //scanf("%d",&a[i][j]);
            a[i][j]=1;
        }
    }

    printf("Enter the number of rows of matrix 2\n");
    int row_2;
    scanf("%d",&row_2);
    printf("Enter the number of columns of matrix 2\n");
    int col_2;
    scanf("%d",&col_2);
    /*  printf("Enter the values of matrix 2 \n");
     */
    int b[row_2][col_2];
    for(int i=0;i<row_2;i++)
    {
        for(int j=0;j<col_2;j++)
        {
            //  scanf("%d",&b[i][j]);
            b[i][j]=1;
        }
    }

    int c[row_1][col_2];
    //dim3 dimBlock(col_1, col_1);// in one block u have row_1*col_2 threads;
    dim3 dimBlock(col_1,col_1);
    //dim3 dimGrid((row_1/col_1)+1,(col_2/col_1)+1); // in one grid you have 1*1 blocks
    dim3 dimGrid(ceil(row_1/col_1),ceil(col_2/col_1));
    int *p;
    int *q;
    int *dev_a,*dev_b,*dev_c;
    int size_a=row_1*col_1*sizeof(int);
    int size_b=row_2*col_2*sizeof(int);
    int size_c = row_1*col_2*sizeof(int);
    cudaMalloc((void**)&dev_a,size_a);
    cudaMalloc((void**)&dev_b,size_b);
    cudaMalloc((void**)&dev_c,size_c);
    cudaMalloc((void**)&p,sizeof(int));
    cudaMalloc((void**)&q,sizeof(int));

    cudaMemcpy(dev_a,a,size_a,cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b,b,size_b,cudaMemcpyHostToDevice);
    cudaMemcpy(dev_c,c,size_c,cudaMemcpyHostToDevice);
    cudaMemcpy(p,&col_1,sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(q,&col_2,sizeof(int),cudaMemcpyHostToDevice);

    add<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c,p,q);
    cudaMemcpy(c,dev_c,size_c,cudaMemcpyDeviceToHost);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf("output matrix is : \n");
    for(int i=0;i<10;i++)
    {
        for(int j=0;j<10;j++)
        {
            printf("%d ",c[i][j]);
        }
        printf("\n");
    }
}

当我乘以大小为32 * 32和32 * 32的矩阵时，我得到了正确的输出但当我乘以大小为33 * 33和33 * 33（及以上）的矩阵时，得到的乘法矩阵包含全零。我试图增加共享内存中矩阵的大小，但我得到以下错误

ptxas error   : Entry function '_Z3addPiS_S_S_S_' uses too much shared data (0x10038 bytes, 0x4000 max)

我对CUDA很新。对不起，如果这是一个太基本的问题

Answer 1

这是一个基本问题，已多次回答。

首先，每当您遇到CUDA代码时遇到问题，请使用proper cuda error checking。在这种情况下，您会收到一个本来有用的错误。
CUDA内核对每个线程块的最大线程数有限制。在所有支持的设备上，该限制（在CUDA 7，7.5RC，当前）是每块1024个线程。每个块的线程数由dimBlock变量指定（在本例中），并且它是每个维度中术语的产品：
```
dim3 dimBlock(col_1,col_1);
add<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c,p,q);
```

因此，当col_1为32时，您要求32x32个线程（1024），这是最大值。因此，任何高于32x32的值都将失败。（你的内核不会启动。当你在这里指定33x33时，不会执行内核代码。）

我建议您不要重写此代码来修复所有问题，而是建议您研究已经提出的有关矩阵乘法的几十个问题，这里是cuda标签。实际上，如果你想在CUDA中看到用于天真矩阵乘法的共享内存优化代码，the programming guide中有一个完整的例子（包括非共享版本和共享版本用于比较）。

我再次建议您在此处寻求帮助之前实施proper cuda error checking。即使您不理解错误结果，对于那些试图帮助您的人来说，这也是有用的信息。

Answer 2

这一行有一个溢出：

aTile[threadIdx.x][threadIdx.y] = a[row*(*p)+threadIdx.y];

知道aTile被定义为__shared__ int aTile[32][32];

如果你想做平铺，你必须循环覆盖你的matrice所需的瓷砖数量。

无法在CUDA中乘以32 * 32大小的矩阵

2 个答案: