Question

我正在尝试模拟cuda C中的矩阵乘法。除输出外，一切都是正确的。

这是我的计划：

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define     N       4
#define TILE_WIDTH 2

__global__ void MatMul(int*A, int* B, int* C) {  

    int sum; 
    int idx = threadIdx.x; 
    int idy = threadIdx.y; 
    int bx = blockIdx.x; 
    int by = blockIdx.y; 
    int k ,uidx , uidy , i; 
    uidx = bx*TILE_WIDTH + idx;
    uidy = by*TILE_WIDTH + idy; 
    sum = 0;


    // Allocating memory in shared memory

    __shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
    __shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

    //copying the data to shared memory 

    for( i =0;i<N/TILE_WIDTH; i++) 
    { 
        temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N]; 
        temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx]; 
        __syncthreads();

        // multiplying matrices in shared memory 

        for(k=0 ; k < TILE_WIDTH;k++) {
            sum = sum + temp1[idy][k]*temp2[k][idx];
        }
    }

    // synchronizing the threads 

    __syncthreads(); 
    C[uidy*N + uidx] = sum;
}

int main( void ) {

    int a[N][N], b[N][N], c[N][N];     //host copies of a,b,c

    int *dev_a, *dev_b, *dev_c;        //device copies of a,b,c

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );

    // fill the matrices 'a' and 'b' on the CPU

    for (int i=0; i<N; i++) {
        for (int j=0; j < N; j++) {
            a[i][j] = j+3;
            b[i][j] = i+6;
        }
    }
    //copy above a,b values to device

    cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
    // Prepare timer
    cudaEvent_t start, stop;
    float time;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //start record
    cudaEventRecord(start, 0);

    // Kernel invocation with N threads 
    dim3 dimGrid(2,2,1); 
    dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
    MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);

    //stop record
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    //this is operation time
    cudaEventElapsedTime(&time, start, stop);

    //clean up      
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    //copy result to host
    cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );

    //output..
    for (int i=0; i < N; i++){
        for (int j=0; j < N; j++)
            printf( "%d ", a[i][j]);
        printf ("  ");
        for (int j=0; j < N; j++)
            printf( "%d ", b[i][j]);
        printf ("  =  ");
        for (int j=0; j < N; j++)
            printf( "%d ", c[i][j]);
        printf ("\n");
    }



    //free the allocated memory in device
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n",time);
    getch();
    return 0;
}

这是我的输出：

3 4 5 6     6 6 6 6         108 108 115 115
3 4 5 6     7 7 7 7         108 108 115 115
3 4 5 6     8 8 8 8         108 108 115 115
3 4 5 6     9 9 9 9         108 108 115 115

显示错误的值。请告诉我程序中的任何错误。我对CUDA C很新。

Answer 1

虽然我不知道您的程序有什么问题，但我认为您应该能够使用更简单的矩阵更好地诊断它。您是否尝试过两个Identity矩阵的乘法？或者充满了所有1。使用各种简单矩阵的重复测试应该证明细胞是如何组合的。

最终，我认为您会发现使用TILE_WIDTH的方式存在问题，但我无法确定。

Answer 2

这应该修复它（在 i 循环中）：

temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];

在cuda c程序中获得错误的值

2 个答案: