CUDA atomicAdd()产生错误的结果

时间:2013-06-02 11:22:36

标签: cuda

我是CUDA的新手,第一次玩CUDA内核。 我有以下内核,它实现了卷积(convolution,非常天真的实现),其中有一个虚拟循环,对全局内存中的同一元素重复执行1000次相同的计算(见下文)。问题是在操作之后,结果矩阵中的一些单元格是错误的:从某个偏移开始,这些值不是人们预期的1000的倍数。 我的内核:

// Naive convolution kernel for the question's launch: grid (28,28), block (5,5).
// Each block owns one output cell; each thread atomically accumulates one
// input*kernel product into it. The loop repeats the identical accumulation
// 100 times (x10 host-side launches = 1000 total), so every result cell is
// expected to be an exact multiple of 1000.
__global__ void conv(float *input, float *kernel, float *target)
{
    for (long i = 0; i <100; i++)
    {
        // target index: row-major (blockIdx.x, blockIdx.y) over a gridDim.y-wide output.
        // input row width is blockDim.y + gridDim.y - 1 (= 5 + 28 - 1 = 32 for this launch).
        atomicAdd(target+gridDim.y*blockIdx.x+blockIdx.y,input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1)+(blockIdx.y+threadIdx.y)]*kernel[threadIdx.x*blockDim.y+threadIdx.y]);
    }
}

内核的调用代码如下:

// NOTE(review): this snippet is shown outside any enclosing function in the
// question; presumably it is the body of main() -- confirm against the full program.
float image[1024] = {0.0};
// 5x5 all-ones convolution kernel.
float kernel[] = 
{ 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f 
};

// 28x28 output buffer (32 - 5 + 1 = 28 per dimension).
float res[784]={0};

// Fill the image with a 0..1023 ramp, treated as a row-major 32x32 matrix.
for (int i = 0; i < 1024; i++)
{
    image[i]=(float)i;
} // Got 32x32 matrix

cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    exit (-1);
}

float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;

// Allocate GPU buffers for three vectors (two input, one output),
// sized to match the host arrays via sizeof.
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

// NOTE(review): these two copies and the memset below are unchecked,
// unlike the cudaMalloc calls above -- their return values should be
// inspected the same way.
cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);

cudaMemset(dev_res,0,sizeof(res));

    // Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);

// Launch the kernel 10 times; combined with the in-kernel 100x loop this
// accumulates each product 1000 times into dev_res.
// NOTE(review): no cudaGetLastError() after the launches -- launch-config
// errors would go unnoticed. The blocking cudaMemcpy below does implicitly
// synchronize before reading results.
for (int itr = 0; itr<10; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}

cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);

printf("res[0]=%f\n",res[0]);

cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);

exit (0);

似乎我处理了并发问题,所以它不应该是根本原因。我感谢任何帮助。

1 个答案:

答案 0 :(得分:2)

您正在对float值进行任意算术并期望完美准确。

float值可以完美地存储整数到某个尾数。一旦我们超过该值,浮动操作就开始变得不精确。当然,结果中倾向于累积到最大数字的值(那些朝向res数组末尾的值)将首先显示此效果。

我们把内核内部的循环次数与主机代码中围绕内核调用的循环次数的乘积称为total_loops。对于高达700左右的total_loops值,我得到"精确"的结果,即所有结果均可被total_loops整除。之后,当您逐渐增加total_loops时,误差开始出现,并从res数组的末尾开始蔓延。

你可以切换到double而不是float,你的结果会有所不同,只是CUDA并没有为double直接提供现成的atomicAdd版本。不过,programming guide展示了如何构造任意的原子操作,其中给出的示例恰好就是为double实现atomicAdd。

因此,对代码进行以下修改可以让您探索这两个想法:

  • 如果您想了解使用double如何解决问题,请将定义从USE_FLOAT更改为USE_DOUBLE。
  • 相反,如果您想了解减少total_loops如何解决问题,请将LOOPS1定义从100更改为70.
  • 我还要提一下,在所有 API调用和内核调用上做cuda error checking是个好习惯(你只覆盖了几个,而不是内核),但它不是在这种情况下的一个问题。

以下是代码:

#include <stdio.h>
#define LOOPS1 100
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
#define USE_FLOAT

#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif

// Software atomicAdd for double, built from a 64-bit atomicCAS loop.
// This is the reference implementation shown in the CUDA C Programming Guide;
// it is needed because a hardware atomicAdd(double*) is not assumed to be
// available here (it requires SM60+).
__device__ double atomicAdd(double* address, double val)
{
    // Reinterpret the double's storage as a 64-bit integer so atomicCAS
    // can operate on it.
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Try to swap in (assumed + val); if another thread updated the
        // location in the meantime, atomicCAS returns the newer value and
        // the loop retries with it.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));
    } while (assumed != old);
    // Return the value that was previously stored, matching the contract
    // of the built-in atomicAdd.
    return __longlong_as_double(old);
}

// Naive convolution kernel over mytype (float or double, per the typedef):
// launched with grid (28,28) and block (5,5). Each thread atomically adds one
// input*kernel product into its block's output cell; the LOOPS1 loop repeats
// the identical accumulation so every result should be an exact multiple of
// LOOPS1 times the number of host-side launches.
__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    // All three indices are loop-invariant, so compute them once up front.
    const unsigned inputWidth = blockDim.y + gridDim.y - 1;   // 5 + 28 - 1 = 32 here
    const unsigned outIdx = gridDim.y * blockIdx.x + blockIdx.y;
    const unsigned inIdx  = (blockIdx.x + threadIdx.x) * inputWidth
                          + (blockIdx.y + threadIdx.y);
    const unsigned kIdx   = threadIdx.x * blockDim.y + threadIdx.y;

    for (long iter = 0; iter < LOOPS1; iter++)
    {
        atomicAdd(target + outIdx, input[inIdx] * kernel[kIdx]);
    }
}

int main(){

// Check every CUDA runtime call; kernel launches are checked separately via
// cudaGetLastError()/cudaDeviceSynchronize() below. The original code left
// the memcpy/memset/free calls unchecked and never checked the launches.
#define CHECK(call)                                                     \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                \
            exit(-10);                                                  \
        }                                                               \
    } while (0)

mytype image[1024] = {0.0};
// 5x5 all-ones convolution kernel.
mytype kernel[] =
{
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};

// 28x28 output buffer (32 - 5 + 1 = 28 per dimension).
mytype res[784]={0};

// Fill the image with a 0..1023 ramp, treated as a row-major 32x32 matrix.
for (int i = 0; i < 1024; i++)
{
    image[i]=(mytype)i;
} // Got 32x32 matrix

cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    exit (-1);
}

mytype *dev_image = 0;
mytype *dev_kernel = 0;
mytype *dev_res = 0;

// Allocate GPU buffers for three vectors (two input, one output),
// sized to match the host arrays via sizeof.
CHECK(cudaMalloc((void**)&dev_image, sizeof(image)));
CHECK(cudaMalloc((void**)&dev_kernel, sizeof(kernel)));
CHECK(cudaMalloc((void**)&dev_res, sizeof(res)));

CHECK(cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice));

CHECK(cudaMemset(dev_res,0,sizeof(res)));

    // Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);

// Each launch accumulates LOOPS1 copies of every product, so after LOOPS2
// launches every result cell should be an exact multiple of LOOPS1*LOOPS2.
for (int itr = 0; itr<LOOPS2; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
    CHECK(cudaGetLastError());   // catch launch-configuration errors
}
CHECK(cudaDeviceSynchronize());  // surface any in-kernel execution errors

CHECK(cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost));

printf("results:\n");
// Report the first cell where float round-off broke exact divisibility.
for (int i = 0; i< (28*28); i++)
  if ((((int)res[i])%(LOOPS1*LOOPS2)) != 0) {printf("first error index: %d, value: %f\n", i, res[i]); return 1;}

CHECK(cudaFree(dev_kernel));
CHECK(cudaFree(dev_image));
CHECK(cudaFree(dev_res));

  return 0;
}

请注意,即使您使用double,如果累积到足够大的值,问题最终也会再次出现。

另请注意,这不是真正的CUDA / GPU问题。主机代码中的float具有类似的限制。