我想计算CUDA中数组的所有元素的总和。我想出了这段代码。它编译没有任何错误。但结果总是为零。我从cudaMemcpyFromSymbol
获得了无效的设备符号。我不能使用任何像Thrust或Cublas这样的库。
#define TRIALS_PER_THREAD 4096
#define NUM_BLOCKS 256
#define NUM_THREADS 256
double *dev;
__device__ volatile double pi_gpu = 0;
__global__ void ArraySum(double *array)
{
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
pi_gpu = pi_gpu + array[tid];
__syncthreads();
}
int main (int argc, char *argv[]) {
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double));
double pi_gpu_h;
ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev);
cudaDeviceSynchronize();
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost);
if( cudaSuccess != err )
{
fprintf( stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
return pi_gpu_h; // this is always zero!!!
}
答案 0 :(得分:4)
符号调用副本中的符号参数不正确。它应该是这样的:
cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0, cudaMemcpyDeviceToHost)
答案 1 :(得分:-2)
您的代码不是线程安全的。从多个线程写入全局变量根本不安全。减少内核可以的示例如下:
//Untested code
global_void plus_reduce(int *input, int N, int *total){
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
// Each block loads its elements into shared memory
_shared_ int x[blocksize];
x[tid] = (i<N) ? input[i] : 0; // last block may pad with 0’s
_syncthreads();
// Build summation tree over elements.
for(int s=blockDim.x/2; s>0; s=s/2){
if(tid < s) x[tid] += x[tid + s];
_syncthreads();
}
// Thread 0 adds the partial sum to the total sum
if( tid == 0 )
atomicAdd(total, x[tid]);
}