#if USE_CONST == 1
__constant__ double PNT[ SIZE ];
#else
__device__ double *PNT;
#endif
稍后我有:
#if USE_CONST == 0
cudaMalloc((void **)&PNT, sizeof(double)*SIZE);
cudaMemcpy(PNT, point, sizeof(double)*SIZE, cudaMemcpyHostToDevice);
#else
cudaMemcpyToSymbol(PNT, point, sizeof(double)*SIZE);
#endif
其中 point
已在前面的代码中定义。使用 USE_CONST=1
时,一切都按预期工作;但不使用它(即 USE_CONST=0)时却不能正常工作。我通过
PNT[ index ]
访问数组。这两种变体之间的问题出在哪里?谢谢!
答案 0 :(得分:3)
在 CUDA 4.0 之前,cudaMemcpyToSymbol 的正确用法是:
cudaMemcpyToSymbol("PNT", point, sizeof(double)*SIZE)
或者:
double *cpnt;
cudaGetSymbolAddress((void **)&cpnt, "PNT");
cudaMemcpy(cpnt, point, sizeof(double)*SIZE, cudaMemcpyHostToDevice);
如果您计划多次从主机API访问该符号,则可能会更快一些。
编辑:我误解了问题。对于全局内存版本,做法与上面常量内存的第二种写法类似:
double *gpnt;
cudaGetSymbolAddress((void **)&gpnt, "PNT");
cudaMemcpy(gpnt, point, sizeof(double)*SIZE, cudaMemcpyHostToDevice);
答案 1 :(得分:1)
虽然这是一个老问题,但我还是为以后通过搜索找到这里的人补充一下:
问题在于:
cudaMalloc((void **)&PNT, sizeof(double)*SIZE);
cudaMemcpy(PNT, point, sizeof(double)*SIZE, cudaMemcpyHostToDevice);
cudaMalloc
把地址写入了 PNT
的主机端副本,而 PNT 实际上是一个设备变量,不能从主机端直接访问。因此正确的做法是:先分配内存,把得到的地址复制到设备符号,再把数据复制到该符号所指向的内存:
void* memPtr;
cudaMalloc(&memPtr, sizeof(double)*SIZE);
cudaMemcpyToSymbol(PNT, &memPtr, sizeof(memPtr));
// In other places you'll need an additional:
// cudaMemcpyFromSymbol(&memPtr, PNT, sizeof(memPtr));
cudaMemcpy(memPtr, point, sizeof(double)*SIZE, cudaMemcpyHostToDevice);
更简单的做法:
#if USE_CONST == 1
__constant__ double PNT[ SIZE ];
#else
__device__ double PNT[ SIZE ];
#endif
// No #if required anymore:
cudaMemcpyToSymbol(PNT, point, sizeof(double)*SIZE);