Question

我正在尝试在设备上分配矩阵，在内核中填入一些数字，然后将其复制回主机。问题是在主机上只有一行似乎被填满。

我有这样的事情：

9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1

这是我的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

void check(cudaError x) {
    fprintf(stderr, "%s\n", cudaGetErrorString(x));
}

void showMatrix2(int* v1, int width, int height) {
    printf("---------------------\n");
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < height; j++) {
            printf("%d ", v1[i * width + j]);
        }
        printf("\n");
    }
}

__global__ void kernel(int* tab,int width, int height, int pitch) {

    int row = threadIdx.x + blockIdx.x * blockDim.x;
    int col = threadIdx.y + blockIdx.y * blockDim.y;

    if (row < width && col < height) {
        tab[col * pitch + row] = 9;
    }
}

int main()
{
    int width = 4;
    int height = 4;

    int* d_tab;
    int* h_tab;

    int realSize = width * height* sizeof(int);

    size_t pitch;
    check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
    h_tab = (int*)malloc(realSize);
    check( cudaMemset(d_tab, 0, realSize) );

    dim3 grid(4, 4);
    dim3 block(4, 4);
    kernel <<<grid, block>>>(d_tab, width, height, pitch);

    check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );

    showMatrix2(h_tab, width, height);
    printf("\nPitch size: %d \n", pitch);
    getchar();
    return 0;
}

Answer 1

每当您遇到CUDA代码时遇到问题，除了进行错误检查外，请使用cuda-memcheck运行代码。如果你这样做了，你至少会得到一个关于发生了什么的提示，然后你可以使用像this这样的技术来继续自己的调试。即使你无法弄清楚，cuda-memcheck输出对于试图帮助你的其他人也会有用。
您的内核中写入无效。这里有多个错误。为了正确访问内核代码中的分配，我强烈建议您研究the documentation中cudaMallocPitch给出的示例。简而言之，这种索引生成刚刚被打破：
```
tab[col * pitch + row]
```
首先，pitch返回的cudaMallocPitch是字节的宽度。您不能将其用作int或float等数量的索引调整（请参阅文档）。其次，音高值最终应该乘以行索引，而不是列索引。
与您的问题无关，但如果您使用的是64位平台，则最终printf语句的格式说明符不正确，应为%ld（或更好，{{ 1}}）。

这是一个修复索引问题的代码，它似乎对我来说正常工作：

%lu

CUDA二维数组

1 个答案: