Question

我写了这样的示例代码。

int ** d_ptr;
cudaMalloc( (void**)&d_ptr, sizeof(int*)*N );

int* tmp_ptr[N];
for(int i=0; i<N; i++)
    cudaMalloc( (void**)&tmp_ptr[i], sizeof(int)*SIZE );
cudaMemcpy(d_ptr, tmp_ptr, sizeof(tmp_ptr), cudaMemcpyHostToDevice);

这段代码运行良好，但在内核启动之后，我无法把结果取回主机。

int* Mtx_on_GPU[N];
cudaMemcpy(Mtx_on_GPU, d_ptr, sizeof(int)*N*SIZE, cudaMemcpyDeviceToHost);

此时会发生段错误（segmentation fault），但我不知道自己错在哪里。

int* Mtx_on_GPU[N];
for(int i=0; i<N; i++)
    cudaMemcpy(Mtx_on_GPU[i], d_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);

此代码也有同样的错误。

我确信我的代码里有错误，但找了一整天也没能找到。

给我一些建议。

Answer 1

在最后一行

cudaMemcpy(Mtx_on_GPU[i], d_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);

您正在尝试将数据从设备复制到主机（注意：我假设您为Mtx_on_GPU指针分配了主机内存！）

但是，d_ptr 中的指针存储在设备内存中，因此您无法直接从主机端访问它们（也就是说，不能在主机代码中对 d_ptr[i] 解引用）。该行应改为：

// tmp_ptr is the host-side array that holds the device pointers returned by
// cudaMalloc, so tmp_ptr[i] can be read on the host (unlike d_ptr[i]).
cudaMemcpy(Mtx_on_GPU[i], tmp_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);

如果使用“过于详尽”的变量名称，这一点可能会变得更加清楚：

// Array of N device pointers; the array itself lives in device memory,
// so its elements must not be read from host code.
int ** devicePointersStoredInDeviceMemory;
cudaMalloc( (void**)&devicePointersStoredInDeviceMemory, sizeof(int*)*N);

// Host-side array holding the same N device pointers. Each element points
// to a device buffer of SIZE ints, but the element itself is host-readable.
int* devicePointersStoredInHostMemory[N];
for(int i=0; i<N; i++)
    cudaMalloc( (void**)&devicePointersStoredInHostMemory[i], sizeof(int)*SIZE );

// Copy the pointer table (N pointers, not the data) to the device.
cudaMemcpy(
    devicePointersStoredInDeviceMemory, 
    devicePointersStoredInHostMemory,
    sizeof(int*)*N, cudaMemcpyHostToDevice);

// Invoke kernel here, passing "devicePointersStoredInDeviceMemory"
// as an argument
...

// Host-side array of host pointers that receive the results.
int* hostPointersStoredInHostMemory[N];
for(int i=0; i<N; i++) {
    // Declaring the array only reserves room for the N pointers; each
    // destination buffer must be allocated before it is copied into.
    hostPointersStoredInHostMemory[i] = (int*)malloc(sizeof(int)*SIZE);
    int* hostPointer = hostPointersStoredInHostMemory[i]; 

    // Read the device pointer from the host-side copy of the table.
    int* devicePointer = devicePointersStoredInHostMemory[i];

    cudaMemcpy(hostPointer, devicePointer, sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
}

编辑回应评论：

d_ptr是“一个指针数组”。但是这个数组的内存是用cudaMalloc分配的。这意味着它位于设备上。与此相反，使用int* Mtx_on_GPU[N];，您在主机内存中“分配”N个指针。您也可以使用malloc，而不是指定数组大小。比较以下分配时可能会更清楚：

// Pointer table allocated with cudaMalloc: the N pointers live in device memory.
int** pointersStoredInDeviceMemory;
cudaMalloc((void**)&pointersStoredInDeviceMemory, sizeof(int*)*N);

// Pointer table allocated with malloc: the N pointers live in host memory.
// The cast must be int** (a void** does not convert implicitly to int**).
int** pointersStoredInHostMemory;
pointersStoredInHostMemory = (int**)malloc(N * sizeof(int*));

// This is not possible, because the array was allocated with cudaMalloc:
int *pointerA = pointersStoredInDeviceMemory[0];

// This is possible because the array was allocated with malloc:    
int *pointerB = pointersStoredInHostMemory[0];

要同时跟踪

- 指针存储在哪里
- 指针指向哪里

可能有点绕脑，但幸运的是，间接引用的层数几乎不会超过 2 层。

CUDA双指针内存复制

1 个答案: