我希望将一个指针数组从一个结构复制到另一个结构。 Struct看起来像这样:
typedef struct COORD3D
{
int x,y,z;
}
COORD3D;
typedef struct structName
{
double *volume;
COORD3D size;
// .. some other vars
}
structName;
我希望在一个函数中执行此操作,在该函数中,我传入结构的空实例的地址和带有我想要复制的数据的结构的地址。目前我通过以下方式连续进行此操作:
void foo(structName *dest, structName *source)
{
// .. some other work
int size = source->size.x * source->size.y * source->size.z;
dest->volume = (double*)malloc(size*sizeof(double));
int i;
for(i=0;i<size;i++)
dest->volume[i] = source->volume[i];
}
我想在CUDA中这样做以加快进程(因为数组非常大[~1200万个元素]。
我尝试了以下但是,虽然代码编译并运行,但我得到的结果存储在数组中的结果不正确(似乎是非常大的随机数)
void foo(structName *dest, structName *source)
{
// .. some other work
int size = source->size.x * source->size.y * source->size.z;
dest->volume = (double*)malloc(size*sizeof(double));
// Device Pointers
double *DEVICE_SOURCE, *DEVICE_DEST;
// Declare memory on GPU
cudaMalloc(&DEVICE_DEST,size);
cudaMalloc(&DEVICE_SOURCE,size);
// Copy Source to GPU
cudaMemcpy(DEVICE_SOURCE,source->volume,size,
cudaMemcpyHostToDevice);
// Setup Blocks/Grids
dim3 dimGrid(ceil(source->size.x/10.0),
ceil(source->size.y/10.0),
ceil(source->size.z/10.0));
dim3 dimBlock(10,10,10);
// Run CUDA Kernel
copyVol<<<dimGrid,dimBlock>>> (DEVICE_SOURCE,
DEVICE_DEST,
source->size.x,
source->size.y,
source->size.z);
// Copy Constructed Array back to Host
cudaMemcpy(dest->volume,DEVICE_DEST,size,
cudaMemcpyDeviceToHost);
}
内核看起来像这样:
__global__ void copyVol(double *source, double *dest,
int x, int y, int z)
{
int posX = blockIdx.x * blockDim.x + threadIdx.x;
int posY = blockIdx.y * blockDim.y + threadIdx.y;
int posZ = blockIdx.z * blockDim.z + threadIdx.z;
if (posX < x && posY < y && posZ < z)
{
dest[posX+(posY*x)+(posZ*y*x)] =
source[posX+(posY*x)+(posZ*y*x)];
}
}
谁能告诉我哪里出错了?
答案 0 :(得分:0)
我冒着错误的答案,但是你遗漏了数据类型的大小吗?
cudaMalloc(&DEVICE_DEST,size);
应该是
cudaMalloc(&DEVICE_DEST,size*sizeof(double));
另外
cudaMemcpy(DEVICE_SOURCE,source->volume,size, cudaMemcpyHostToDevice);
应该是
cudaMemcpy(DEVICE_SOURCE,source->volume,size*sizeof(double), cudaMemcpyHostToDevice);
等等。