I am using the following CUDA kernel:
__global__
void sum_worker(int *data, int *sum_ptr)
{
    __shared__ int block_sum;
    int idx = threadIdx.x;
    int thread_sum = 0;
    if (threadIdx.x == 0)
        block_sum = 2;
    for (int i = idx; i < MAX_INDEX; i += blockDim.x)
        thread_sum += data[i];
    __syncthreads();
    atomicAdd(&block_sum, thread_sum);
    __syncthreads();
    if (threadIdx.x == 0)
        *sum_ptr = block_sum;
}
Launched with this code:
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
It works fine (no runtime errors, and it produces the correct result). However, if I change i += blockDim.x to i += 32, then the next call to cudaDeviceSynchronize() reports the following error:
Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97
Running the kernel under cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x703b70d7c is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./perfmeas [0x17c7]
========= Host Frame:./perfmeas [0x16b7]
========= Host Frame:./perfmeas [0x16e2]
========= Host Frame:./perfmeas [0x153f]
========= Host Frame:./perfmeas [0xdcd]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./perfmeas [0xf39]
....
The address 0x703b70d7c is indeed out of bounds for data: the array starts at 0x703b40000 and has MAX_INDEX elements. In this test MAX_INDEX is 50000, and (0x703b70d7c - 0x703b40000) / 4 = 50015.
Adding an extra check for i >= 50000 makes the problem magically go away:
for (int i = idx; i < MAX_INDEX; i += 32) {
    if (i >= MAX_INDEX)
        printf("WTF!\n");
    thread_sum += data[i];
}
Is this a bug in CUDA, or am I doing something stupid here?
I am running CUDA 7.5 on Ubuntu 16.04. Output of nvcc --version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
The full source code of this test case can be found here:
http://svn.clifford.at/handicraft/2016/perfmeas
(Run it with the option -gx. That version uses i += blockDim.x; change it to i += 32 to reproduce the problem.)
Edit: @njuffa said in the comments that he does not want to follow links off Stack Overflow because he is "too afraid [his] computer might catch something" and prefers a test case he can copy and paste directly from Stack Overflow. So here it is:
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>
#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)
__global__
void primes_worker(int *data)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= MAX_INDEX)
        return;
    int p = 2*idx+1;
    for (int i = 3; i*i <= p; i += 2) {
        if (p % i == 0) {
            data[idx] = 0;
            return;
        }
    }
    data[idx] = idx ? p : 0;
}
__global__
void sum_worker(int *data, int *sum_ptr)
{
    __shared__ int block_sum;
    int idx = threadIdx.x;
    int thread_sum = 0;
    if (threadIdx.x == 0)
        block_sum = 2;
#ifdef ENABLE_BUG
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else
    for (int i = idx; i < MAX_INDEX; i += blockDim.x)
        thread_sum += data[i];
#endif
    __syncthreads();
    atomicAdd(&block_sum, thread_sum);
    __syncthreads();
    if (threadIdx.x == 0)
        *sum_ptr = block_sum;
}
int *primes_or_zeros;
int *sum_buffer;
void primes_gpu_init()
{
    cudaError_t err;
    err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX);
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
    err = cudaMallocHost((void**)&sum_buffer, sizeof(int));
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
void primes_gpu_done()
{
    cudaError_t err;
    err = cudaFree(primes_or_zeros);
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
    err = cudaFreeHost(sum_buffer);
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
int primes_gpu()
{
    int num_blocks = (MAX_INDEX + 31) / 32;
    int num_treads = 32;
    primes_worker<<<num_blocks, num_treads>>>(primes_or_zeros);
    sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
    return *sum_buffer;
}
int main()
{
    primes_gpu_init();
    int result = primes_gpu();
    printf("Result: %d\n", result);
    if (result != 454396537) {
        printf("Incorrect result!\n");
        return 1;
    }
    primes_gpu_done();
    return 0;
}
Usage:
$ nvcc -o demo demo.cu
$ ./demo
Result: 454396537
$ nvcc -D ENABLE_BUG -o demo demo.cu
$ ./demo
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!
Answer (score: 4):
TL;DR: The observed behavior is most likely caused by a bug in the ptxas component of the CUDA 7.5 toolchain, specifically its loop unroller. The bug may already be fixed in CUDA 8.0 RC, which is publicly available.
I was able to reproduce the behavior reported in the question on a 64-bit Windows 7 platform with a Quadro K2200 GPU, which is an sm_50 device. The main difference in the generated machine code (SASS) with ENABLE_BUG defined is that the loop is unrolled by a factor of four. This is a direct consequence of changing the loop increment from a variable (namely blockDim.x) to the compile-time constant 32, which allows the compiler to compute the trip count at compile time.
Interestingly, at the intermediate PTX level the loop is still rolled, even with the increment of 32:
BB7_4:
ld.global.u32 %r12, [%rd10];
add.s32 %r16, %r12, %r16;
add.s64 %rd10, %rd10, 128;
add.s32 %r15, %r15, 32;
setp.lt.s32 %p3, %r15, 50000;
@%p3 bra BB7_4;
Since the loop does get unrolled in the machine code, it must be the ptxas unroller that applies this transformation.
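To illustrate why a four-way unroll can read past the end of the array, here is a hand-written sketch of what the unrolled loop would effectively do if the unroller failed to handle the remaining iterations when a thread's trip count is not a multiple of four. This is only an illustration of a plausible failure mode, not the code ptxas actually generated. Note that in this sketch the last chunk for thread 31 starts at i = 49951, so its third load touches data[50015], which is exactly the out-of-bounds index computed from the cuda-memcheck report above ((0x703b70d7c - 0x703b40000) / 4 = 50015).
// Hand-written sketch of a 4x-unrolled version of the summation loop that
// omits the remainder handling. NOT the actual ptxas output; it only
// illustrates how such an unroll can read out of bounds.
#define MAX_INDEX 50000
__global__ void sum_worker_unrolled_sketch(const int *data, int *partial)
{
    int idx = threadIdx.x;               // one block of 32 threads
    int thread_sum = 0;
    // The guard only checks the first of the four loads, so the last chunk
    // of a thread whose remaining trip count is not a multiple of 4 reads
    // past the end of the array.
    for (int i = idx; i < MAX_INDEX; i += 4 * 32) {
        thread_sum += data[i];           // thread 31, last chunk: data[49951], OK
        thread_sum += data[i + 32];      //                        data[49983], OK
        thread_sum += data[i + 64];      //                        data[50015], out of bounds
        thread_sum += data[i + 96];      //                        data[50047], out of bounds
    }
    partial[idx] = thread_sum;           // reduction omitted; only the loads matter here
}
A correct unroll would either stop the unrolled loop early and finish the leftover iterations in a rolled epilogue, or guard each of the four loads individually.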
If I lower the ptxas optimization level to -O1 by specifying -Xptxas -O1 on the nvcc command line, the code works as expected. If I build the code for sm_30 (which causes JIT compilation when it is run on an sm_50 device), the code also works as expected when running with the latest driver, Windows driver 369.26. This strongly suggests that there is a bug in the unroller of the CUDA 7.5 ptxas component which has since been fixed, since the ptxas component embedded in the CUDA driver is newer than the ptxas component of the CUDA 7.5 toolchain.
Placing #pragma unroll 4 directly in front of the loop also works around the problem, because in that case the unrolling is performed by the nvvm component of the compiler, which means the unrolled loop already appears at the PTX level:
#if ENABLE_BUG
#pragma unroll 4
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else
Resulting PTX:
BB7_5:
.pragma "nounroll";
ld.global.u32 %r34, [%rd14];
add.s32 %r35, %r34, %r45;
ld.global.u32 %r36, [%rd14+128];
add.s32 %r37, %r36, %r35;
ld.global.u32 %r38, [%rd14+256];
add.s32 %r39, %r38, %r37;
ld.global.u32 %r40, [%rd14+384];
add.s32 %r45, %r40, %r39;
add.s64 %rd14, %rd14, 512;
add.s32 %r44, %r44, 128;
setp.lt.s32 %p5, %r44, %r3;
@%p5 bra BB7_5;