我正在尝试编写 C-CUDA 代码来实现点积(dot product),而不在内核中使用 for 循环。下面的代码先把输入向量(分别用 10 和 15 填充)的分块搬运到对应的共享浮点数组 s_in1 和 s_in2;这两个数组逐元素相乘的结果存入共享浮点数组 block。当输入数组大小为 32000(inputLength = 32000)时结果是正确的(4'800'000),但大小为 320000(inputLength = 320000)时结果就错了(得到 48'192'608 而不是 48'000'000)。为什么?即使我改用普通的 float 变量 block 而不是共享数组重写代码,也会出现同样的问题。每次执行代码时结果都完全相同。在此先感谢您的帮助!
我在Jetson TX1 - CUDA 7.0上用以下代码编译代码:
nvcc mycode.cu -o mycode
这是完整的代码:
#define THREADS_PER_BLOCK 1000
// Dot-product kernel: one thread per element; each thread multiplies one
// pair of inputs and atomically accumulates the product into *out.
// NOTE(review): *out is never zero-initialized by the host before launch,
// and accumulating 320000 products of 150.0f exceeds what a float's 24-bit
// mantissa can represent exactly -- both points are diagnosed in the
// answer below.
__global__ void scalar_prod(float *in1, float *in2, float *out)
{
// Shared staging arrays; no data is ever exchanged between threads, so
// this staging buys nothing (see the answer's closing remark).
__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];
// Flat global index. There is no bounds guard, so the launch must cover
// the input exactly (inputLength a multiple of THREADS_PER_BLOCK).
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
block[threadIdx.x] = s_in1[threadIdx.x] * s_in2[threadIdx.x];
// The barrier is unnecessary: each thread only reads back the shared
// slots it wrote itself.
__syncthreads();
// One global atomic per thread -- heavily serialized under contention.
atomicAdd(out, block[threadIdx.x]);
}
int main()
{
int inputLength=320000;
float *hostInput1;
float *hostInput2;
// NOTE(review): this zero is never copied to deviceOutput, so the device
// accumulator starts with whatever garbage cudaMalloc returned -- this is
// one of the two bugs identified in the answer below.
float hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;
hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));
// Constant inputs: the expected dot product is inputLength * 10 * 15.
for(i=0;i<inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
// NOTE(review): no CUDA API return code is checked anywhere in this
// program, so allocation/copy/launch failures would pass silently.
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
cudaMalloc((void **)&deviceOutput, sizeof(float));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
// NOTE(review): inputLength/THREADS_PER_BLOCK is integer division, so the
// truncation happens before ceil() ever sees the value -- a latent bug for
// any length that is not a multiple of THREADS_PER_BLOCK.
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
答案 0(得分:2)
代码至少有两个问题:
在开始 atomicAdd 操作之前,您没有初始化 deviceOutput 所指向的存储,所以它的初始值是未定义的。
您超出了 float 算术的精度能力。
第 1 项的修复很简单——在运行内核之前,我们可以很容易地把它初始化为零。对于第 2 项,一个简单的“修复”是把所有的 float 都换成 double。但是,您的 Jetson GPU 上并没有现成的 double 版本的 atomicAdd 内建函数,不过 programming guide 给出了用 atomicCAS 实现它的方法。把这些结合起来,我们最终会得到一个可以正常工作的代码:
$ cat t122.cu
#include <stdio.h>
#define THREADS_PER_BLOCK 1000
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
// atomicAdd for double, built from atomicCAS, for GPUs without a native
// double-precision atomicAdd (such as the Jetson TX1 targeted here).
// Implementation follows the CUDA C Programming Guide.
__device__ double my_atomicAdd(double* address, double val) {
// Reinterpret the 64-bit double as an unsigned long long so atomicCAS
// (which has no double overload) can operate on its bit pattern.
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
// Retry until no other thread modified *address between our read and
// our compare-and-swap.
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
// Return the value held before this thread's addition, matching the
// contract of the built-in atomicAdd.
return __longlong_as_double(old);
}
// float overload: a hardware atomicAdd for float exists on this target,
// so simply forward to the built-in and hand back its return value
// (the value *addr held before the addition).
__device__ float my_atomicAdd(float *addr, float val){
float previous = atomicAdd(addr, val);
return previous;
}
// Dot-product kernel: one thread per element; each thread multiplies one
// pair of inputs and atomically accumulates the product into *out.
// Precondition: the grid covers the input exactly (the host launches
// inputLength / THREADS_PER_BLOCK blocks and inputLength is a multiple of
// THREADS_PER_BLOCK); there is no bounds guard, so a partial final block
// would read out of bounds.
// The original staged the inputs through __shared__ arrays and issued a
// __syncthreads(), but no data is ever exchanged between threads (as the
// answer's own closing remark notes), so the staging and the barrier were
// pure overhead -- a register holds the product instead.
__global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
{
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
mytype prod = in1[xIndex] * in2[xIndex];
my_atomicAdd(out, prod);
}
// Host driver: fills two constant vectors, computes their dot product on
// the GPU, and prints the result (expected: inputLength * 10 * 15).
int main()
{
int inputLength=320000;
mytype *hostInput1;
mytype *hostInput2;
// Copied to the device below so the accumulator starts at exactly zero.
mytype hostOutput=0;
mytype *deviceInput1;
mytype *deviceInput2;
mytype *deviceOutput;
unsigned int i;
hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
if (hostInput1 == NULL || hostInput2 == NULL) {
printf("host allocation failed\n");
return 1;
}
// Constant inputs: expected dot product is inputLength * 10 * 15.
for(i=0;i<(unsigned int)inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceOutput, sizeof(mytype));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
// Zero the device accumulator before the kernel adds into it.
cudaMemcpy(deviceOutput, &hostOutput,
sizeof(mytype),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
// Integer ceiling division. The original ceil(inputLength/THREADS_PER_BLOCK)
// truncated in integer arithmetic before ceil() ran -- harmless here
// (320000 is a multiple of 1000) but wrong for other lengths. Note the
// kernel has no bounds guard, so inputLength must remain a multiple of
// THREADS_PER_BLOCK.
dim3 gridDim((inputLength + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
// Kernel launches are asynchronous: surface launch-configuration errors
// (cudaGetLastError) and asynchronous execution errors (the sync) before
// trusting the result.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess) err = cudaDeviceSynchronize();
if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
// %f works for both float (default promotion) and double builds.
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
$ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
$ ./t122
result:48000000.000000
$
请注意,此代码中还有许多其他地方意义不大——例如,这里使用 __shared__ 内存并没有带来任何好处,因为线程之间并没有真正共享数据:向量点积中不存在输入数据的复用。但这些似乎不是您问题的焦点,也不会使代码不正确。