谁能告诉我下面这段 CUDA 内核代码有什么问题:
// Fragment taken from inside the kernel (the enclosing kernel signature is not shown).
__shared__ float data[32];
__shared__ float location[64];
// One byte per entry: 64 rows x 32 columns.
__shared__ unsigned char signs[64][32];
__shared__ int pages[32];
// Only the threads with threadIdx.y == 0 clear rows 0..15 of signs.
if (threadIdx.y==0)
{
for (int i=0; i<8; ++i)
{
signs[i][threadIdx.x]=0;// threadIdx.x <= 31 here, so the column index stays inside signs
// The debugger reports that the store on the next line causes the access violation.
signs[i+8][threadIdx.x]=0;
};
};
CUDA 调试器给出了以下错误信息:
CUDA Memory Checker 检测到 32 个线程引发了访问冲突:
Memory Checker 检测到 32 次访问冲突。
错误 = 未对齐的存储(misaligned store,共享内存)
CUDA(CUDA 5.0)是否像普通 C 一样支持对 char 类型的算术运算(+ 和 -)?
调试器还报告了共享内存中的一些未对齐加载(misaligned load),例如下面这行代码:
// Each thread adds the byte in row 33+2*threadIdx.y into row 32+2*threadIdx.y (same column).
signs[32+threadIdx.y*2][threadIdx.x]+=signs[33+threadIdx.y*2][threadIdx.x];
Memory Checker 检测到 32 次访问冲突。
错误 = 未对齐的加载(misaligned load,共享内存)
以上所有代码的线程块大小为: nx = 32,ny = 4 ;
gridDim = {2000,1,1}
blockDim = {32,4,1}
sharedSize = 2560
更新:以下代码可以在我的机器上(CUDA 5.0,VS2010)重现该内存错误:
// Minimal reproduction kernel: repeatedly zeroes two 8-row bands of a shared
// byte matrix and synchronizes the block after every pass.
//
// signal: work is performed only when signal > 0; since the value is a kernel
//         argument it is the same for every thread, so either all threads run
//         the loops (and reach every barrier) or none do.
//
// Launch layout: `signs` has 32 columns indexed by threadIdx.x, so blockDim.x
// must not exceed 32. Threads with threadIdx.y == 1 clear rows 16..23, threads
// with threadIdx.y == 2 clear rows 24..31; all other threads only take part in
// the barrier.
static __global__ void test(int signal)
{
    // data, locations and pages are never accessed in this kernel; they are
    // kept so the set of shared-memory declarations stays unchanged.
    __shared__ float data[32];
    __shared__ float locations[64];
    __shared__ unsigned char signs[64][32];
    __shared__ int pages[32];

    // Guard clause: nothing to do for a non-positive signal.
    if (signal <= 0) {
        return;
    }

    const int col = threadIdx.x;
    const int band = threadIdx.y;

    // Bands 1 and 2 own rows [16, 24) and [24, 32) respectively.
    const bool ownsBand = (band == 1) || (band == 2);
    const int firstRow = 8 * (band + 1);
    const int lastRow = firstRow + 8;

    for (int outer = 0; outer < 99; ++outer) {
        for (int inner = outer + 1; inner < 100; ++inner) {
            if (ownsBand) {
                for (int row = firstRow; row < lastRow; ++row) {
                    signs[row][col] = 0;
                }
            }
            // Reached by every thread of the block on every inner iteration.
            __syncthreads();
        }
    }
}
启动代码:
// Host entry point: launches the reproduction kernel on 2000 blocks of
// 32 x 4 threads, waits for it to finish and reports any CUDA error.
//
// Returns 0 when the launch and the kernel execution both succeed, 1 otherwise.
int main()
{
    const int xrow = 2000;            // number of blocks in the 1-D grid
    const dim3 thread2D(32, 4, 1);    // block shape: 32 x 4 threads
    int exitCode = 0;

    test<<<xrow, thread2D>>>(1);

    // A kernel launch returns nothing itself: configuration errors are
    // reported by cudaGetLastError(), faults raised while the kernel runs are
    // reported by the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }

    // Keeps the console window open (Windows "pause" command), as before.
    system("pause");
    return exitCode;
}
测试环境:
CUDA 5.0; Nsight 3.0.0.13027 (RC1); Visual Studio 2010; Build options: 64 bit codes, sm 3.0 and 3.5 (both can reproduce the bug); OS: Win7