Dot product in C/CUDA without a for loop

Date: 2017-05-10 13:35:20

Tags: cuda

I am trying to write C/CUDA code that computes a dot product without using a for loop inside the kernel. The code below first stages tiles of the input vectors (filled with 10 and 15, respectively) into the corresponding shared float arrays s_in1 and s_in2; the products of the corresponding elements of these arrays are stored in the shared float array block. For an input size of 32000 (inputLength = 32000) the result is correct (4,800,000), but for a size of 320000 (inputLength = 320000) it is wrong (48,192,608 instead of 48,000,000). Why? The same problem appears even if I rewrite the code using a plain float variable block instead of the shared arrays. The result is identical on every run. Thanks in advance for your help!

I compile the code on a Jetson TX1 (CUDA 7.0) with:

nvcc mycode.cu -o mycode

Here is the full code:

#include <stdio.h>

#define THREADS_PER_BLOCK 1000

__global__ void scalar_prod(float *in1, float *in2, float *out) 
{

__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];

unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];

block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
__syncthreads();
atomicAdd(out, block[threadIdx.x]);
}

int main()
{

int inputLength=320000;
float *hostInput1;
float *hostInput2;
float  hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;

hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));

for(i=0;i<inputLength;++i)
{
  hostInput1[i]=10;
  hostInput2[i]=15;
}

cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
cudaMalloc((void **)&deviceOutput, sizeof(float));

cudaMemcpy(deviceInput1, hostInput1, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);

dim3 blockDim(THREADS_PER_BLOCK);
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));

scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);

cudaDeviceSynchronize();

cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);

printf("\n result:%f \n",hostOutput);

cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2); 
return 0;     
}

1 Answer:

Answer 0 (score: 2)

The code has at least two problems:

  1. You are not initializing the storage pointed to by deviceOutput before you begin your atomicAdd operations, so its initial value is undefined.

  2. You are exceeding the capability of float arithmetic: a float carries only a 24-bit significand, so not every integer above 2^24 = 16,777,216 is exactly representable, and the expected total of 48,000,000 lies well past that limit (see the short demonstration after this list).
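
To make item 2 concrete, here is a minimal host-side demonstration (not part of the original answer) of how float rounding discards low-order bits once a running sum passes 2^24:

    #include <stdio.h>

    int main(void)
    {
      float f = 16777216.0f;        /* 2^24: the last power of two below which
                                       every integer is exactly representable */
      printf("%.1f\n", f + 1.0f);   /* prints 16777216.0: the +1 is rounded away */

      /* Accumulating 150.0f (10 * 15) 320000 times hits the same rounding,
         so the serial float sum also drifts away from the exact 48000000. */
      float sum = 0.0f;
      for (int i = 0; i < 320000; ++i)
        sum += 150.0f;
      printf("%.1f\n", sum);        /* not exactly 48000000.0 */
      return 0;
    }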

The fix for item 1 is simple: we can easily initialize the output to zero before running the kernel. For item 2, a simple "fix" is to switch everything from float to double. On your Jetson GPU, however, there is no convenient atomicAdd intrinsic for double, but the programming guide gives us a possible implementation using atomicCAS. If we combine these, we end up with working code:

    $ cat t122.cu
    #include <stdio.h>
    #define THREADS_PER_BLOCK 1000
    
    #ifdef USE_DOUBLE
    typedef double mytype;
    #else
    typedef float mytype;
    #endif
    
    __device__ double my_atomicAdd(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
          assumed = old;
          old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        } while (assumed != old);
      return __longlong_as_double(old);
    }
    __device__ float my_atomicAdd(float *addr, float val){
      return atomicAdd(addr, val);
    }
    
    __global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
    {
    __shared__ mytype block[THREADS_PER_BLOCK];
    __shared__ mytype s_in1[THREADS_PER_BLOCK];
    __shared__ mytype s_in2[THREADS_PER_BLOCK];
    
    unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    
    block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
    __syncthreads();
    my_atomicAdd(out, block[threadIdx.x]);
    }
    
    int main()
    {
    
    int inputLength=320000;
    mytype *hostInput1;
    mytype *hostInput2;
    mytype  hostOutput=0;
    mytype *deviceInput1;
    mytype *deviceInput2;
    mytype *deviceOutput;
    unsigned int i;
    
    hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
    
    for(i=0;i<inputLength;++i)
    {
      hostInput1[i]=10;
      hostInput2[i]=15;
    }
    
    cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceOutput, sizeof(mytype));
    
    cudaMemcpy(deviceInput1, hostInput1, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    cudaMemcpy(deviceInput2, hostInput2, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    cudaMemcpy(deviceOutput, &hostOutput,
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    dim3 blockDim(THREADS_PER_BLOCK);
    dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
    
    scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
    
    cudaDeviceSynchronize();
    
    cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
    
    printf("\n result:%f \n",hostOutput);
    
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    free(hostInput1);
    free(hostInput2);
    return 0;
    }
    $ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
    $ ./t122
    
     result:48000000.000000
    $
    

Note that there are a number of other items in this code that don't make much sense; for example, the use of __shared__ memory here gives you no benefit, since there is no actual sharing of data between threads and no reuse of input data in a vector dot product (a sketch of the pattern where shared memory does help follows below). But these didn't seem to be the focus of your question, and they don't make the code incorrect.
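
As a footnote to that last point: shared memory starts to pay off in a dot product only when each block first reduces its products in shared memory and then issues a single atomicAdd per block, rather than one per thread. That does reintroduce a loop in the kernel, which the question wanted to avoid, but for reference here is a minimal sketch of the conventional reduction pattern (illustrative only, not taken from the original answer; it assumes a power-of-two block size such as 256):

    #define THREADS 256  /* power of two, so the halving loop below works */

    __global__ void dot_reduce(const float *in1, const float *in2,
                               float *out, int n)
    {
      __shared__ float partial[THREADS];
      unsigned int i = blockIdx.x * THREADS + threadIdx.x;

      /* Each thread stores one product; out-of-range threads contribute 0. */
      partial[threadIdx.x] = (i < n) ? in1[i] * in2[i] : 0.0f;
      __syncthreads();

      /* Tree reduction in shared memory: halve the active threads each step. */
      for (unsigned int s = THREADS / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
          partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
      }

      /* One atomic per block instead of one per thread. The float-precision
         caveat from item 2 still applies; use double for exact large sums. */
      if (threadIdx.x == 0)
        atomicAdd(out, partial[0]);
    }

Besides cutting the number of atomic operations from one per element to one per block, the pairwise summation inside each block also tends to accumulate less rounding error than a long serial chain of additions.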