我是CUDA的新手,第一次玩CUDA内核。 我有以下内核,实现了卷积(convolution,非常天真的实现),其中有一个虚拟循环,对全局内存中的相同元素重复执行1000次相同的计算(内核内循环100次 × 主机端启动10次,见下文)。问题是在操作之后,结果矩阵中的一些单元格是错误的:从某个偏移开始,这些值不是人们预期的1000的倍数。 我的内核:
// Naive 2D convolution kernel.
// Launch layout (see the host code below): one block per output cell
// (gridDim = 28x28) and one thread per kernel tap (blockDim = 5x5).
// Each thread atomically accumulates one input*kernel product into the
// output cell owned by its block; the loop repeats the identical
// accumulation 100 times, so after one launch every output cell should
// be an exact multiple of 100 (barring float rounding).
__global__ void conv(float *input, float *kernel, float *target)
{
for (long i = 0; i <100; i++)
{
// target index: row-major output cell (blockIdx.x, blockIdx.y).
// input row stride is blockDim.y + gridDim.y - 1 = 5 + 28 - 1 = 32,
// i.e. the width of the 32x32 input image.
atomicAdd(target+gridDim.y*blockIdx.x+blockIdx.y,input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1)+(blockIdx.y+threadIdx.y)]*kernel[threadIdx.x*blockDim.y+threadIdx.y]);
}
}
内核的调用代码如下:
// Host-side driver: builds a 32x32 ramp image (image[i] = i) and an
// all-ones 5x5 kernel, launches the convolution 10 times into the same
// accumulator buffer, and prints the first result cell.
float image[1024] = {0.0};
float kernel[] =
{
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};
float res[784]={0};
for (int i = 0; i < 1024; i++)
{
image[i]=(float)i;
} // Got 32x32 matrix
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
exit (-1);
}
float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
// NOTE(review): the three transfers below and the kernel launches are not
// error-checked; the blocking cudaMemcpy after the launch loop is the only
// implicit synchronization point before results are read.
cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
cudaMemset(dev_res,0,sizeof(res));
// Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result.
// One block per output cell, one thread per kernel tap.
dim3 blocks(28,28,1);
dim3 threads(5,5,1);
for (int itr = 0; itr<10; itr++)
{
conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}
cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
printf("res[0]=%f\n",res[0]);
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
exit (0);
似乎我处理了并发问题,所以它不应该是根本原因。我感谢任何帮助。
答案 0 :(得分:2)
您正在对 float 值进行任意算术运算,却期望结果完全精确。float 类型只能在尾数位数允许的范围内精确地表示整数;一旦累积值超出该范围,浮点运算就开始变得不精确。自然地,结果中累积出最大数值的那些单元(即靠近 res 数组末尾的单元)会最先表现出这种效应。
我们把内核内部的循环次数(LOOPS1)与主机代码中内核启动次数(LOOPS2)的乘积称为 total_loops。对于不超过 700 左右的 total_loops 值,我得到"精确"的结果,即所有结果均可被 total_loops 整除。此后,随着您逐渐增大 total_loops,误差开始出现,并从 res 数组的末尾开始蔓延。
你可以切换到 double 而不是 float,结果会有所改善,只是 CUDA 并未在所有架构上直接提供针对 double 的 atomicAdd。不过,programming guide(CUDA 编程指南)展示了如何构造任意的原子操作,其给出的示例恰好就是为 double 实现 atomicAdd。
因此,对代码进行以下修改可以让您探索这两个想法:把 USE_FLOAT 宏改为 USE_DOUBLE 即可切换到 double 类型;若想观察降低 total_loops 如何解决问题,请将 LOOPS1 的定义从 100 更改为 70。以下是代码:
#include <stdio.h>
#define LOOPS1 100
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
#define USE_FLOAT
#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif
// Double-precision atomicAdd emulated with a 64-bit compare-and-swap loop,
// as shown in the CUDA C Programming Guide.
// Bug fix: compute capability 6.0+ provides a native
// atomicAdd(double*, double); defining this overload there triggers a
// "function atomicAdd has already been defined" compile error, so the
// emulation is restricted to older architectures.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    // Reinterpret the target as a 64-bit integer so atomicCAS can operate on it.
    unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Retry until no other thread modified *address between the read
        // and the compare-and-swap.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                            __longlong_as_double(assumed)));
    } while (assumed != old);
    // Return the previous value, matching the built-in atomicAdd contract.
    return __longlong_as_double(old);
}
#endif
// Naive convolution kernel: one block per output cell (gridDim = output
// size) and one thread per kernel tap (blockDim = kernel size).  Each
// thread atomically adds the same input*kernel product LOOPS1 times into
// the output cell owned by its block.
__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    // Row-major index of the output cell this block accumulates into.
    const int outIdx = gridDim.y * blockIdx.x + blockIdx.y;
    // Input row width: blockDim.y + gridDim.y - 1 (e.g. 5 + 28 - 1 = 32).
    const int inStride = blockDim.y + gridDim.y - 1;
    const int inIdx = (blockIdx.x + threadIdx.x) * inStride
                    + (blockIdx.y + threadIdx.y);
    const int kIdx = threadIdx.x * blockDim.y + threadIdx.y;
    for (long iter = 0; iter < LOOPS1; iter++)
    {
        atomicAdd(target + outIdx, input[inIdx] * kernel[kIdx]);
    }
}
// Demo driver: convolve a 32x32 ramp image with an all-ones 5x5 kernel.
// Each of LOOPS2 kernel launches adds LOOPS1 identical contributions per
// thread, so every output cell should be divisible by LOOPS1*LOOPS2 as
// long as the floating-point type can still represent the sums exactly.
int main(){
mytype image[1024] = {0.0};
mytype kernel[] =
{
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};
mytype res[784]={0};
for (int i = 0; i < 1024; i++)
{
image[i]=(mytype)i;
} // Got 32x32 matrix: image[row*32+col] == row*32+col
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
exit (-1);
}
mytype *dev_image = 0;
mytype *dev_kernel = 0;
mytype *dev_res = 0;
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
// Bug fix: these transfers were previously unchecked; a failure here would
// silently produce garbage results downstream.
cudaStatus = cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
exit(-11);
}
cudaStatus = cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
exit(-11);
}
cudaStatus = cudaMemset(dev_res,0,sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed!");
exit(-11);
}
// Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result.
// One block per output cell, one thread per kernel tap.
dim3 blocks(28,28,1);
dim3 threads(5,5,1);
for (int itr = 0; itr<LOOPS2; itr++)
{
conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}
// Bug fix: kernel launches return no status directly; surface launch
// errors (cudaGetLastError) and asynchronous execution errors
// (cudaDeviceSynchronize) before trusting the results.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-12);
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-12);
}
cudaStatus = cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
exit(-11);
}
printf("results:\n");
// Every cell should be an exact multiple of the total loop count; report
// the first one that is not.  Precision loss appears at the array tail
// first, where the accumulated values are largest.
for (int i = 0; i< (28*28); i++)
if ((((int)res[i])%(LOOPS1*LOOPS2)) != 0) {
printf("first error index: %d, value: %f\n", i, res[i]);
// Bug fix: release device memory on the early-exit path too.
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
return 1;
}
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
return 0;
}
请注意,即使您使用 double,如果累积到足够大的值,问题最终也会再次出现。另请注意,这并不是 CUDA / GPU 特有的问题:主机代码中的 float 同样具有这种限制。