我写了这样的示例代码。
// Table of N int* entries; the table itself is allocated with cudaMalloc,
// so it resides in device memory.
int ** d_ptr;
cudaMalloc( (void**)&d_ptr, sizeof(int*)*N );
// Host array that receives the N pointers returned by the cudaMalloc calls
// below; each entry gets its own buffer of SIZE ints.
int* tmp_ptr[N];
for(int i=0; i<N; i++)
cudaMalloc( (void**)&tmp_ptr[i], sizeof(int)*SIZE );
// Copy the pointer table (sizeof(tmp_ptr) == N * sizeof(int*)) from the host
// array into the table behind d_ptr.
cudaMemcpy(d_ptr, tmp_ptr, sizeof(tmp_ptr), cudaMemcpyHostToDevice);
这段代码运行正常,但在内核启动之后,我无法把结果取回主机。
// Host array of N int* values; the entries are not initialized here.
int* Mtx_on_GPU[N];
// NOTE(review): this requests sizeof(int)*N*SIZE bytes, but the destination
// holds only N pointers and d_ptr was allocated with sizeof(int*)*N bytes.
// Whenever sizeof(int)*SIZE > sizeof(int*) the copy overruns both buffers.
// d_ptr also holds pointers, not the int data itself.
cudaMemcpy(Mtx_on_GPU, d_ptr, sizeof(int)*N*SIZE, cudaMemcpyDeviceToHost);
此时会发生段错误(segmentation fault),但我不知道自己错在哪里。
// Host array of N int* values; no host buffers are assigned to the entries
// in this snippet.
int* Mtx_on_GPU[N];
// NOTE(review): d_ptr[i] is evaluated on the host although d_ptr points to
// memory obtained from cudaMalloc, and Mtx_on_GPU[i] is read while still
// uninitialized.
for(int i=0; i<N; i++)
cudaMemcpy(Mtx_on_GPU[i], d_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
此代码也有同样的错误。
我确信我的代码里有错误,但找了一整天也没能找到。
请给我一些建议。
答案 0 :(得分:6)
在最后一行
cudaMemcpy(Mtx_on_GPU[i], d_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
您正在尝试将数据从设备复制到主机(注意:我假设您为Mtx_on_GPU
指针分配了主机内存!)
但是,d_ptr 数组中的这些指针本身存储在设备内存中,因此您无法在主机端直接访问 d_ptr[i]。该行应改为:
// Read the device address from tmp_ptr, the host-side array that holds the
// pointers returned by cudaMalloc; d_ptr[i] cannot be read on the host.
cudaMemcpy(Mtx_on_GPU[i], tmp_ptr[i], sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
使用“过于复杂”的变量名称时,这可能会变得更加清晰:
// Pointer table whose storage is in device memory (allocated with cudaMalloc).
// Its elements must never be read with [] from host code.
int ** devicePointersStoredInDeviceMemory;
cudaMalloc( (void**)&devicePointersStoredInDeviceMemory, sizeof(int*)*N);

// Host array holding the same device addresses. The host can read these
// entries and pass them to cudaMalloc / cudaMemcpy.
int* devicePointersStoredInHostMemory[N];
for(int i=0; i<N; i++)
    cudaMalloc( (void**)&devicePointersStoredInHostMemory[i], sizeof(int)*SIZE );

// Upload the table of N device addresses so that kernels can index it.
cudaMemcpy(
    devicePointersStoredInDeviceMemory,
    devicePointersStoredInHostMemory,
    sizeof(int*)*N, cudaMemcpyHostToDevice);

// Invoke kernel here, passing "devicePointersStoredInDeviceMemory"
// as an argument
// ...

// Host-side result buffers, one per row.
int* hostPointersStoredInHostMemory[N];
for(int i=0; i<N; i++) {
    // Allocate the host destination before copying into it; copying through
    // an uninitialized pointer is what produces the segmentation fault.
    hostPointersStoredInHostMemory[i] = (int*)malloc(sizeof(int)*SIZE);
    int* hostPointer = hostPointersStoredInHostMemory[i];
    // The device address is taken from the host-side array, never from the
    // device-side table.
    int* devicePointer = devicePointersStoredInHostMemory[i];
    cudaMemcpy(hostPointer, devicePointer, sizeof(int)*SIZE, cudaMemcpyDeviceToHost);
}
编辑回应评论:
d_ptr
是“一个指针数组”。但是这个数组的内存是用cudaMalloc
分配的。这意味着它位于设备上。与此相反,使用int* Mtx_on_GPU[N];
,您在主机内存中“分配”N个指针。您也可以使用malloc
,而不是指定数组大小。比较以下分配时可能会更清楚:
// Array of N pointers whose storage is in device memory (allocated with cudaMalloc).
int** pointersStoredInDeviceMemory;
cudaMalloc((void**)&pointersStoredInDeviceMemory, sizeof(int*)*N);

// Array of N pointers whose storage is in host memory (allocated with malloc).
// The cast must be int**: in C++ a void** does not convert implicitly to int**.
int** pointersStoredInHostMemory;
pointersStoredInHostMemory = (int**)malloc(N * sizeof(int*));

// This is not possible, because the array was allocated with cudaMalloc:
// (the host would be dereferencing a device address)
int *pointerA = pointersStoredInDeviceMemory[0];

// This is possible because the array was allocated with malloc:
// (the element must still be assigned before its value is meaningful)
int *pointerB = pointersStoredInHostMemory[0];
跟踪每个指针究竟存放在主机还是设备上,可能有点烧脑,但幸运的是,间接引用的层数几乎不会超过两层。