I have implemented the following vector dot product. It is compiled with CUDA 7.5 for compute_20,sm_20 and with const int THREADS_PER_BLOCK = 16;.
The same thing happens with both floats and doubles. It works correctly up to n=368, but beyond that the results are incorrect. I would like to know whether the problem is my implementation code or the values I am using (see the second code snippet, the initialization); for example, maybe the additions beyond n=368 introduce floating-point error (which would be strange, since the same error occurs with both floats and doubles).
int divUp(int total, int grain) { return (total+grain-1)/grain; }

// atomicAdd for double implemented via atomicCAS (compute_20 has no native double atomicAdd)
__device__ __forceinline__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
    }
    while(assumed!=old);
    return __longlong_as_double(old);
}

// atomicAdd for float implemented via atomicCAS
__device__ __forceinline__ float atomicAdd(float* address, float val)
{
    unsigned int *ptr = (unsigned int *)address;
    unsigned int old, newint, ret = *ptr;
    do {
        old = ret;
        newint = __float_as_int(__int_as_float(old)+val);
    } while((ret = atomicCAS(ptr, old, newint)) != old);
    return __int_as_float(ret);
}

template<typename T>
__global__ void vecdotk(const T* a, const T* b, const int n, T* c)
{
    __shared__ T temp[THREADS_PER_BLOCK];
    int x = threadIdx.x+blockIdx.x*blockDim.x;
    if(x==0) c[0] = 0.0;
    // each thread writes one product into shared memory
    if(x<n) {temp[threadIdx.x] = a[x]*b[x];}
    else temp[threadIdx.x] = 0.0;
    __syncthreads();
    // thread 0 of each block sums the block's products and accumulates into c[0]
    if(0==threadIdx.x)
    {
        T sum = 0.0;
        for(int j=0; j<THREADS_PER_BLOCK; ++j)
        {
            sum += temp[j];
        }
        atomicAdd(c, sum);
    }
}

template<typename T>
void dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaSafeCall(cudaGetLastError());
};
I use the following two host vectors to populate the input device arrays (I am not showing the copies at the moment, since they are part of a larger library). Basically I want to compute the sum of a squared sequence, i.e. 1*1 + 2*2 + ... + n*n:
// fill host vectors a and b
for(int i=0; i<n; ++i)
{
    h_vec_a[i] = i+1;//__mat_rand();
    h_vec_b[i] = i+1;//__mat_rand();
}
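(With this initialization both vectors are 1, 2, ..., n, so the exact dot product is the sum of squares n(n+1)(2n+1)/6. A small host-side reference like the hypothetical expected_dot below, which is not part of my library, is what I use to check the result:)

// hypothetical reference check: exact sum of squares 1*1 + 2*2 + ... + n*n,
// via the closed form n(n+1)(2n+1)/6 (exact in 64-bit integers for these n)
long long expected_dot(long long n)
{
    return n*(n+1)*(2*n+1)/6;
}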
Answer (score: 1):
This won't work:
if(x==0) c[0] = 0.0;
There is no guarantee (in CUDA) that thread 0 runs first, or that that line will execute before other threads reach any given point in the code. You need to initialize c[0] before launching this kernel. Otherwise, some threads may perform their atomic add to c, and then, later on, thread 0 may initialize c[0] to zero.
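One way to do that (a minimal sketch, using the device pointer and type names from the full test case below) is to zero the device accumulator from the host before each launch:

// zero the accumulator on the device before launching the kernel;
// an all-zero bit pattern is a valid 0.0 for both float and double
cudaMemset(d_c, 0, sizeof(mytype));
dot(d_vec_a, d_vec_b, n, d_c);

(The test case below instead copies a zeroed host value with cudaMemcpy, which is equivalent.)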
Furthermore, CUDA already provides a float version of atomicAdd; there is no reason for you to supply your own. Also, threadblocks of 16 threads will not give good performance (I would suggest just using the CUBLAS dot-product function). With the fix for c[0] (remove that line of code, and initialize c[0] before the kernel) your code runs correctly for me:
$ cat t372.cu
#include <stdio.h>

const int n = 2048;
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
const int THREADS_PER_BLOCK = 16;

int divUp(int total, int grain) { return (total+grain-1)/grain; }

#if 0
__device__ __forceinline__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
    }
    while(assumed!=old);
    return __longlong_as_double(old);
}

__device__ __forceinline__ float atomicAdd(float* address, float val)
{
    unsigned int *ptr = (unsigned int *)address;
    unsigned int old, newint, ret = *ptr;
    do {
        old = ret;
        newint = __float_as_int(__int_as_float(old)+val);
    } while((ret = atomicCAS(ptr, old, newint)) != old);
    return __int_as_float(ret);
}
#endif

template<typename T>
__global__ void vecdotk(const T* a, const T* b, const int n, T* c)
{
    __shared__ T temp[THREADS_PER_BLOCK];
    int x = threadIdx.x+blockIdx.x*blockDim.x;
    //if(x==0) c[0] = 0.0;
    if(x<n) {temp[threadIdx.x] = a[x]*b[x];}
    else temp[threadIdx.x] = 0.0;
    __syncthreads();
    if(0==threadIdx.x)
    {
        T sum = 0.0;
        for(int j=0; j<THREADS_PER_BLOCK; ++j)
        {
            sum += temp[j];
        }
        atomicAdd(c, sum);
    }
}

template<typename T>
cudaError_t dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaDeviceSynchronize();
    return cudaGetLastError();
};

int main(){
    mytype *h_vec_a, *h_vec_b, *d_vec_a, *d_vec_b, *h_c, *d_c;
    int bs = n*sizeof(mytype);
    h_vec_a = (mytype *)malloc(bs);
    h_vec_b = (mytype *)malloc(bs);
    h_c = (mytype *)malloc(sizeof(mytype));
    cudaMalloc(&d_vec_b, bs);
    cudaMalloc(&d_vec_a, bs);
    cudaMalloc(&d_c, sizeof(mytype));
    // fill host vectors a and b
    for(int i=0; i<n; ++i)
    {
        h_vec_a[i] = i+1;//__mat_rand();
        h_vec_b[i] = i+1;//__mat_rand();
    }
    h_c[0] = 0;
    cudaMemcpy(d_vec_a, h_vec_a, bs, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vec_b, h_vec_b, bs, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, h_c, sizeof(mytype), cudaMemcpyHostToDevice);
    dot(d_vec_a, d_vec_b, n, d_c);
    cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost);
    mytype test_val = 0;
    for (int i=0; i < n; i++)
        test_val += h_vec_a[i] * h_vec_b[i];
    printf("GPU result: %f, CPU result: %f\n", h_c[0], test_val);
}
$ nvcc -arch=sm_20 -o t372 t372.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ cuda-memcheck ./t372
========= CUDA-MEMCHECK
GPU result: 2865411584.000000, CPU result: 2865411072.000000
========= ERROR SUMMARY: 0 errors
$
The numerical difference in the last 3 digits is due to the limits of float, not to any error in the code. For example, if you change the initialization so that each vector is filled with all 1s, you will get an exact result in that case.
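(Since the test case selects the type with the USE_DOUBLE macro, one way to confirm this is to rebuild with something like the line below; for this n, every product and partial sum is a small integer that double represents exactly, so the GPU and CPU results should then agree to the last digit.)

$ nvcc -arch=sm_20 -DUSE_DOUBLE -o t372 t372.cu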
Again, there are various criticisms that could be levelled at your code for performance reasons. If you want a fast dot product, I would suggest using CUBLAS.
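For example, a minimal sketch of the CUBLAS route for the float case (error checking omitted; link with -lcublas) might look like this, assuming d_vec_a and d_vec_b are device arrays of n floats:

#include <cublas_v2.h>

cublasHandle_t handle;
cublasCreate(&handle);
float result = 0.0f;
// computes result = sum over i of d_vec_a[i]*d_vec_b[i], returned to the host pointer
cublasSdot(handle, n, d_vec_a, 1, d_vec_b, 1, &result);
cublasDestroy(handle);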