Question

我正在尝试并行使用块和线程添加两个矩阵，但我没有得到正确的结果矩阵来打印出来。我想我无法从主机与设备通信，反之亦然。

这是我得到的输出。结果矩阵搞砸了。

矩阵a： 18 27 48 28 6 16 40 15 30 41
         30 15 25 24 8 0 7 18 7 23
         0 15 47 13 26 16 6 17 39 30
         6 25 11 22 44 34 37 38 31 15
         8 16 17 0 29 6 13 3 30 46
         24 21 30 48 15 23 47 41 26 21
         25 45 14 15 27 20 44 14 7 39
         28 49 3 40 35 40 45 0 37 5
         31 17 24 1 48 6 6 2 46 42
         10 43 20 17 14 23 2 21 26 8

矩阵b： 44 40 26 14 2 49 6 20 46 36
         0 9 5 46 13 26 29 7 46 13
         39 41 30 28 4 6 34 32 43 47
         30 15 46 19 46 45 49 35 1 8
         31 16 36 7 31 38 34 25 26 24
         30 11 11 13 13 25 40 14 42 7
         30 40 49 44 13 1 44 26 13 22
         13 25 41 31 13 10 36 9 18 14
         28 25 33 31 41 27 5 11 44 31
         32 5 44 7 22 45 42 26 24 24

结果矩阵：

400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496
400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496
400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496 400848496

这是我的代码：

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>

#define N 10
#define BLOCK_DIM 5

__global__ void matrixAdd (int *a, int *b, int *c);
int main() {

 int a[N][N], b[N][N], c[N][N];
 int *dev_a, *dev_b, *dev_c;
 int size = N * N * sizeof(int);
 time_t t;

 //initialize rand function
 srand((unsigned) time(&t));

// initialize a and b with values 

    for(int i=0; i<N; i++ )
    {
        for(int j=0; j<N; j++)
        {
            a[i][j]= rand() % 50;
            b[i][j]= rand() % 50;
        }
    }
// print matrix a   
    printf("matrix a: ");
    for(int i=0; i<N; i++)
    {
     printf("\n\t ");
        for(int j=0; j<N; j++)
        {
            printf("%d   ", a[i][j]);
        }
    printf(" ");
    }

    printf("\n ");
//print matrix b    
    printf("matrix b: ");
    for(int i=0; i<N; i++)
    {
     printf("\n\t ");
        for(int j=0; j<N; j++)
        {
            printf("%d   ", b[i][j]);
        }
    printf(" ");
    }

    printf("\n ");

//allocate memory
 cudaMalloc((void**)&dev_a, size);
 cudaMalloc((void**)&dev_b, size);
 cudaMalloc((void**)&dev_c, size);

 cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
 cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

 dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
 dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));

 matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
 /// __syncthreads(); function call from host not allowed
 cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

// print matrix c   

printf("The resultants matrix: \n");
// print matrix c   

    for(int i=0; i<N; i++)
    {
     printf("\n\t ");
        for(int j=0; j<N; j++)
        {
            printf("%d ", c);
        }
    printf(" ");
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

 return 0;

}

__global__ void matrixAdd (int *a, int *b, int *c) {

 int col = blockIdx.x * blockDim.x + threadIdx.x;
 int row = blockIdx.y * blockDim.y + threadIdx.y;
 int index = col + row * N;

    if (col < N && row < N) 
    {
        c[index] = a[index] + b[index];
    }
     __syncthreads();
}

Answer 1

您的最终输出printf声明中有拼写错误。这样：

printf("%d ", c);

应该是这样的：

printf("%d ", c[i][j]);

（与之前的printf语句一致）

FWIW，内核中的最后__syncthreads()语句没有用处。

无法打印出正确的CUDA矩阵添加结果

1 个答案: