在此cuda代码中,具有动态分配的成员变量的结构体数组的深层复制存在问题。我认为是因为&deviceHistogram
指向主机上的地址而不是设备上的地址。我尝试像here中那样制作一个中间指针变量,但这没有用;如何正确复制整个结构数组,以便可以从makeHistogram
函数进行修改?
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
__host__ __device__ void outputHistogram(histogramBin* histogram, int size) {
for (int i = 0; i < size; i++) {
printf("%d: ", i);
if (!histogram[i].count) {
printf("EMPTY");
} else {
for (int j = 0; j < histogram[i].count; j++) {
printf("%d ", histogram[i].items[j]);
}
}
printf("\n");
}
}
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin** histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = (*histogram)[thisBin].count; // **** out of memory access here****
(*histogram)[thisBin].items[position] = rH[r];
(*histogram)[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(&deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
将其正确复制到设备后,如何将其恢复到主机上?例如,如果我从outputHistogram(histogram, 5);
内部调用makeHistogram
,则会看到以下内容:
0: 2 4
1: 1 3 5
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
这是我期望的输出。
当我从outputHistogram(histogram, 8)
(在histogramDriver
之后)呼叫cudaDeviceSynchronize()
时,会看到以下内容:
0: EMPTY
1: EMPTY
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
很显然,我没有正确地将值从设备复制回主机。
我尝试通过执行与histogramDriver
中的操作相反的步骤进行复制:
for(int i = 0; i < n; i++){
cudaMemcpy(&(tempData[i]), &(deviceHistogram[i].items), sizeof(int*), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(histogram[i].items, tempData[i], rSize * sizeof(int), cudaMemcpyDeviceToHost);
}
但是outputHistogram
中histogramDriver
调用的输出保持不变。
答案 0 :(得分:2)
如@talonmies所示,这里最大的问题是内核的设计。没有理由/不需要为histogram
使用双指针(事实上,您发布的代码的第一版迭代在内核原型中没有,尽管它不完整)。
通过删除双指针方面,您的代码将运行,而没有任何运行时错误。
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin* histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = histogram[thisBin].count;
histogram[thisBin].items[position] = rH[r];
histogram[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
const int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
$ nvcc t1452.cu -o t1452
$ cuda-memcheck ./t1452
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
请注意,这里唯一的更改是内核代码本身,加上内核调用中的&符号的删除,以及我在const
的定义中添加了rSize
以便进行编译。
我不知道它是否产生正确的输出,因为您没有包括检查输出的方法,也没有指出您期望的输出是什么。如果您对此感兴趣,可以将它们包括在MVE中。