我正在尝试编写 C-CUDA 代码来实现点积(dot product),而不在内核中使用 for 循环。下面的代码先把输入向量(分别用 10 和 15 填充)的分块搬运到对应的共享浮点数组 s_in1 和 s_in2;这两个数组逐元素相乘的结果存入共享浮点数组 block。当输入数组大小为 32000(inputLength = 32000)时结果是正确的(4'800'000),但大小为 320000(inputLength = 320000)时结果就错了(得到 48'192'608 而不是 48'000'000)。为什么?即使我改用普通的 float 变量 block 而不是共享数组重写代码,也会出现同样的问题。每次执行代码时结果都完全相同。在此先感谢您的帮助!
我在Jetson TX1 - CUDA 7.0上用以下代码编译代码:
nvcc mycode.cu -o mycode
这是完整的代码:
#define THREADS_PER_BLOCK 1000
// Dot-product kernel: one thread per element; each thread multiplies one
// pair of inputs and atomically accumulates the product into *out.
// NOTE(review): *out is never zero-initialized by the host before launch,
// and accumulating 320000 products of 150.0f exceeds what a float's 24-bit
// mantissa can represent exactly -- both points are diagnosed in the
// answer below.
__global__ void scalar_prod(float *in1, float *in2, float *out)
{
// Shared staging arrays; no data is ever exchanged between threads, so
// this staging buys nothing (see the answer's closing remark).
__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];
// Flat global index. There is no bounds guard, so the launch must cover
// the input exactly (inputLength a multiple of THREADS_PER_BLOCK).
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
block[threadIdx.x] = s_in1[threadIdx.x] * s_in2[threadIdx.x];
// The barrier is unnecessary: each thread only reads back the shared
// slots it wrote itself.
__syncthreads();
// One global atomic per thread -- heavily serialized under contention.
atomicAdd(out, block[threadIdx.x]);
}
int main()
{
int inputLength=320000;
float *hostInput1;
float *hostInput2;
// NOTE(review): this zero is never copied to deviceOutput, so the device
// accumulator starts with whatever garbage cudaMalloc returned -- this is
// one of the two bugs identified in the answer below.
float hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;
hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));
// Constant inputs: the expected dot product is inputLength * 10 * 15.
for(i=0;i<inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
// NOTE(review): no CUDA API return code is checked anywhere in this
// program, so allocation/copy/launch failures would pass silently.
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
cudaMalloc((void **)&deviceOutput, sizeof(float));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
// NOTE(review): inputLength/THREADS_PER_BLOCK is integer division, so the
// truncation happens before ceil() ever sees the value -- a latent bug for
// any length that is not a multiple of THREADS_PER_BLOCK.
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
答案 0(得分:2)
代码至少有两个问题:
在开始 atomicAdd 操作之前,您没有初始化 deviceOutput 所指向的存储,所以它的初始值是未定义的。
您超出了 float 算术的精度能力。
第 1 项的修复很简单——在运行内核之前,我们可以很容易地把它初始化为零。对于第 2 项,一个简单的“修复”是把所有的 float 都换成 double。但是,您的 Jetson GPU 上并没有现成的 double 版本的 atomicAdd 内建函数,不过 programming guide 给出了用 atomicCAS 实现它的方法。把这些结合起来,我们最终会得到一个可以正常工作的代码:
$ cat t122.cu
#include <stdio.h>
#define THREADS_PER_BLOCK 1000
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
// atomicAdd for double, built from atomicCAS, for GPUs without a native
// double-precision atomicAdd (such as the Jetson TX1 targeted here).
// Implementation follows the CUDA C Programming Guide.
__device__ double my_atomicAdd(double* address, double val) {
// Reinterpret the 64-bit double as an unsigned long long so atomicCAS
// (which has no double overload) can operate on its bit pattern.
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
// Retry until no other thread modified *address between our read and
// our compare-and-swap.
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
// Return the value held before this thread's addition, matching the
// contract of the built-in atomicAdd.
return __longlong_as_double(old);
}
// float overload: a hardware atomicAdd for float exists on this target,
// so simply forward to the built-in and hand back its return value
// (the value *addr held before the addition).
__device__ float my_atomicAdd(float *addr, float val){
float previous = atomicAdd(addr, val);
return previous;
}
// Dot-product kernel: one thread per element; each thread multiplies one
// pair of inputs and atomically accumulates the product into *out.
// Precondition: the grid covers the input exactly (the host launches
// inputLength / THREADS_PER_BLOCK blocks and inputLength is a multiple of
// THREADS_PER_BLOCK); there is no bounds guard, so a partial final block
// would read out of bounds.
// The original staged the inputs through __shared__ arrays and issued a
// __syncthreads(), but no data is ever exchanged between threads (as the
// answer's own closing remark notes), so the staging and the barrier were
// pure overhead -- a register holds the product instead.
__global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
{
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
mytype prod = in1[xIndex] * in2[xIndex];
my_atomicAdd(out, prod);
}
// Host driver: fills two constant vectors, computes their dot product on
// the GPU, and prints the result (expected: inputLength * 10 * 15).
int main()
{
int inputLength=320000;
mytype *hostInput1;
mytype *hostInput2;
// Copied to the device below so the accumulator starts at exactly zero.
mytype hostOutput=0;
mytype *deviceInput1;
mytype *deviceInput2;
mytype *deviceOutput;
unsigned int i;
hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
if (hostInput1 == NULL || hostInput2 == NULL) {
printf("host allocation failed\n");
return 1;
}
// Constant inputs: expected dot product is inputLength * 10 * 15.
for(i=0;i<(unsigned int)inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceOutput, sizeof(mytype));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
// Zero the device accumulator before the kernel adds into it.
cudaMemcpy(deviceOutput, &hostOutput,
sizeof(mytype),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
// Integer ceiling division. The original ceil(inputLength/THREADS_PER_BLOCK)
// truncated in integer arithmetic before ceil() ran -- harmless here
// (320000 is a multiple of 1000) but wrong for other lengths. Note the
// kernel has no bounds guard, so inputLength must remain a multiple of
// THREADS_PER_BLOCK.
dim3 gridDim((inputLength + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
// Kernel launches are asynchronous: surface launch-configuration errors
// (cudaGetLastError) and asynchronous execution errors (the sync) before
// trusting the result.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess) err = cudaDeviceSynchronize();
if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
// %f works for both float (default promotion) and double builds.
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
$ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
$ ./t122
result:48000000.000000
$
请注意,此代码中还有许多其他地方意义不大——例如,这里使用 __shared__ 内存并没有带来任何好处,因为线程之间并没有真正共享数据:向量点积中不存在输入数据的复用。但这些似乎不是您问题的焦点,也不会使代码不正确。