这是我第一次在 CUDA 中使用结构体。在下面的程序中,我把一个结构体复制到 GPU,对其中的数据执行基本操作,然后把结果复制回主机。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * Passes a CUDA runtime status code through, reporting failures in debug builds.
 *
 * When DEBUG or _DEBUG is defined and `result` is not cudaSuccess, the
 * runtime's error string is printed to stderr and
 * `assert(result == cudaSuccess)` is evaluated. When neither macro is
 * defined, no check is performed at all.
 *
 * @param result Status code returned by a CUDA runtime API call.
 * @return `result`, unchanged, so a call can be wrapped inline.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        // Newline-terminate the message so consecutive reports do not run together.
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}
// Pairs a pointer to int data with an int field; passed by pointer to structOperation.
typedef struct myStruct {
// Pointer to the int elements; structOperation indexes it as a[threadIdx.x].
int* a;
// main() sets this to 32 and uses it as the element count when allocating `a`.
int b;
}MyStruct;
/**
 * Adds 10 to the elements of d_data->a, one element per thread.
 *
 * Launch layout: 1D blocks in a 1D grid. Each thread handles the element at
 * its global index (blockIdx.x * blockDim.x + threadIdx.x); threads whose
 * index is not below d_data->b do nothing, so the launch may be larger than
 * the array. No shared memory is used.
 *
 * @param d_data Pointer that is dereferenced on the device. Both the struct
 *               and the buffer `a` points to must therefore be
 *               device-accessible, and `b` must hold the number of ints
 *               available through `a`.
 */
__global__ void structOperation(MyStruct *d_data){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail: never touch memory past the b elements of a.
    if (idx < d_data->b) {
        d_data->a[idx] += 10;
    }
}
// Question's version of the driver: builds a 32-element MyStruct on the host,
// tries to mirror it on the GPU, launches structOperation and prints the array.
// NOTE(review): this listing is kept as posted. It mixes up host and device
// pointers in several places (flagged inline) and is reported to end in a
// segmentation fault; nothing allocated here is freed either.
int main(){
MyStruct *h_data, *d_data, *out_data;
size_t structSize = sizeof(MyStruct);
size_t intSize = sizeof(int);
// Host input: the struct plus an array of b ints.
h_data = (MyStruct *) malloc(structSize * 1);
h_data->b = 32;
h_data->a = (int *)malloc(intSize * h_data->b);
// Host output buffer with the same layout.
out_data = (MyStruct *) malloc(structSize * 1);
out_data->b = 32;
out_data->a = (int *)malloc(intSize * out_data->b);
// Fill the input with 0..31.
for(int i = 0; i<32; i++){
h_data->a[i] = i;
}
// Device allocation for the struct itself.
checkCuda(cudaMalloc(&d_data, sizeof(MyStruct) * 1));
// NOTE(review): &(d_data->a) is an address inside the device allocation made
// on the previous line, but cudaMalloc stores the new pointer through that
// address from host code, i.e. the host writes to device memory.
checkCuda(cudaMalloc(&(d_data->a), sizeof(int) * 32));
// NOTE(review): &d_data and &h_data are the addresses of the host pointer
// variables, not the struct storage they point to.
checkCuda(cudaMemcpy(&d_data, &h_data, sizeof(MyStruct) * 1, cudaMemcpyHostToDevice));
// NOTE(review): both arguments are addresses of the pointer members `a`,
// not of the int arrays those members point to.
checkCuda(cudaMemcpy(&(d_data->a), &(h_data->a), sizeof(int) * 32, cudaMemcpyHostToDevice));
// One block of 32 threads, one thread per element.
structOperation<<<1,32>>>(d_data);
// NOTE(review): same address-of-pointer problem as the host-to-device struct copy above.
checkCuda(cudaMemcpy(&out_data, &d_data, sizeof(myStruct) * 1, cudaMemcpyDeviceToHost));
// Disabled array copy-back; it would also read d_data->b through a device pointer on the host.
//cudaMemcpy(&(out_data->a), &(d_data->a), sizeof(int) * d_data->b, cudaMemcpyDeviceToHost);
printf("\nDataElements : ");
// NOTE(review): no active statement copies the device array into out_data->a before it is read here.
for(int i = 0; i<32; i++){
printf(" %d",out_data->a[i]);
}
printf("\n");
}
执行后我得到了“段错误”(Segmentation fault)。我想我对结构体的操作方式有误。正确的实现方法是什么?
答案 0 :(得分:7)
提供的代码中存在多个无效的内存访问。
1. 在主机端访问设备内存(即用 cudaMalloc 分配的内存,例如 d_data->a)会导致未定义行为(段错误等)。
2. cudaMemcpy 的参数是指针本身,而不是指针的地址。因此,cudaMemcpy(&d_data, &h_data... 应替换为 cudaMemcpy(d_data, h_data...。
为带有设备指针成员的结构体分配设备对象有点棘手,可以按如下步骤实现:
1. 在主机上创建一个临时对象(MyStruct temp)。
2. 为它的成员 a 分配设备内存(cudaMalloc(&temp.a, bytes))。
3. 分配设备端的结构体对象(cudaMalloc(&d_data, sizeof(MyStruct)))。
4. 将临时主机对象复制到设备对象(cudaMemcpy(d_data, &temp, sizeof(MyStruct), cudaMemcpyHostToDevice))。
请注意,当您在设备上修改 d_data->a 所指向的内容时,通过 temp.a 访问到的内容也会随之改变,因为它们实际上指向设备上的同一块内存。
最终的 main 函数如下所示:
/**
 * Host driver: runs structOperation on a MyStruct whose array lives on the GPU.
 *
 * Steps:
 *   1. Build the input (values 0..count-1) and an output buffer on the host.
 *   2. Fill a host-side staging struct `temp` whose member `a` is a device
 *      pointer, then copy `temp` into a device-resident MyStruct. The host
 *      never dereferences a device pointer.
 *   3. Launch the kernel with one block of `count` threads.
 *   4. Copy the array back through `temp.a` and print it.
 *
 * @return 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main(){
    const int count = 32;                          // number of ints in the array
    const size_t arrayBytes = count * sizeof(int);

    MyStruct *h_data = (MyStruct *) malloc(sizeof(MyStruct));
    MyStruct *out_data = (MyStruct *) malloc(sizeof(MyStruct));
    if (h_data == NULL || out_data == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_data);
        free(out_data);
        return EXIT_FAILURE;
    }

    h_data->b = count;
    out_data->b = count;
    h_data->a = (int *) malloc(arrayBytes);
    out_data->a = (int *) malloc(arrayBytes);
    if (h_data->a == NULL || out_data->a == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_data->a);
        free(out_data->a);
        free(h_data);
        free(out_data);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < count; i++) {
        h_data->a[i] = i;
    }

    // Staging struct on the host: `b` is a plain value, `a` will hold a device pointer.
    MyStruct temp;
    temp.b = h_data->b;
    checkCuda(cudaMalloc(&temp.a, arrayBytes));
    // Copy the host array into the device buffer behind temp.a.
    checkCuda(cudaMemcpy(temp.a, h_data->a, arrayBytes, cudaMemcpyHostToDevice));

    // Device-resident struct; receives a byte copy of temp, device pointer included.
    MyStruct *d_data = NULL;
    checkCuda(cudaMalloc(&d_data, sizeof(MyStruct)));
    checkCuda(cudaMemcpy(d_data, &temp, sizeof(MyStruct), cudaMemcpyHostToDevice));

    structOperation<<<1, count>>>(d_data);
    // A launch returns no status itself; fetch launch-configuration errors explicitly.
    checkCuda(cudaGetLastError());

    // temp.a and the device struct's `a` are the same device buffer, so the
    // kernel's updates are read back through temp.a. This blocking copy on the
    // default stream also waits for the kernel to finish.
    checkCuda(cudaMemcpy(out_data->a, temp.a, arrayBytes, cudaMemcpyDeviceToHost));

    printf("\nDataElements : ");
    for (int i = 0; i < count; i++) {
        printf(" %d", out_data->a[i]);
    }
    printf("\n");

    checkCuda(cudaFree(temp.a));
    checkCuda(cudaFree(d_data));
    free(h_data->a);
    free(out_data->a);
    free(h_data);
    free(out_data);
    return 0;
}