我正在尝试执行一个运行 Gabor 滤波器的内核,但收到了以下错误:
/Gabor_Cuda/gaborMax.cu(2387) : getLastCudaError() CUDA error : convolutionColumnGaborMaxGPU() execution failed : (8) invalid device function
这是“gaborMax.cu”文件的“2387”行:
getLastCudaError("convolutionColumnGaborMaxGPU() execution failed\n");
它引用的内核就是这个:
// Launch the Gabor convolution kernel: the input image, sixteen result
// buffers, the image dimensions, and the per-thread load count (the same
// loadsPerThread value is passed for both the X and the Y direction).
convolutionGaborMaxGPU<<<blockGridColumns, threadBlockColumns>>>(
    d_Input,
    d_Result0, d_Result1, d_Result2, d_Result3,
    d_Result4, d_Result5, d_Result6, d_Result7,
    d_Result8, d_Result9, d_Result10, d_Result11,
    d_Result12, d_Result13, d_Result14, d_Result15,
    DATA_W, DATA_H,
    loadsPerThread, loadsPerThread);
我曾犹豫是否应该贴出内核的代码,因为它有 1500 多行;我原本认为直接发布文件“gaborMax.cu”会更好,但无论如何,下面是内核的代码:
/*
 * Gabor filter-bank convolution kernel (2D launch: uses threadIdx.x/y and
 * blockIdx.x/y).
 *
 * Phase 1: the block stages a tile of d_Input into the shared array `data`
 *          (SUBPICW * SUBPICW floats, row stride SUBPICW). Each thread makes
 *          up to loadsPerThreadX * loadsPerThreadY loads; global indices that
 *          fall outside the image are clamped before the read.
 * Phase 2: after a block-wide barrier, each thread accumulates a weighted sum
 *          over horizontal offsets -18..+18 and vertical offsets
 *          -KERNEL_RADIUS0..+KERNEL_RADIUS0 around its own tile position,
 *          using coefficients from d_Kernel0 (row stride KERNEL_W0).
 * Phase 3: one value per filter is stored to d_Result0..d_Result15 at the
 *          thread's linear image index.
 *
 * Parameters:
 *   d_Input                  input image, indexed linearly with row stride dataW
 *   d_Result0 .. d_Result15  output buffers, one per filter, same indexing
 *   dataW, dataH             image width and height in elements
 *   loadsPerThreadX/Y        number of tile loads each thread performs along
 *                            x and y while filling shared memory
 *
 * NOTE(review): sum1..sum15 are stored at the end of the kernel, but the code
 *   that computes them is not part of this excerpt.
 * NOTE(review): the final stores are not guarded by any bounds check --
 *   presumably the grid is expected to cover the image exactly; confirm
 *   against the launch configuration.
 * NOTE(review): IMUL, SUBPICW, APRON0, KERNEL_RADIUS0, KERNEL_W0, d_Kernel0
 *   and convolutionGaborMax18 are defined outside this excerpt.
 */
__global__ void convolutionGaborMaxGPU(
float *d_Input,
float *d_Result0,
float *d_Result1,
float *d_Result2,
float *d_Result3,
float *d_Result4,
float *d_Result5,
float *d_Result6,
float *d_Result7,
float *d_Result8,
float *d_Result9,
float *d_Result10,
float *d_Result11,
float *d_Result12,
float *d_Result13,
float *d_Result14,
float *d_Result15,
int dataW,
int dataH,
int loadsPerThreadX,
int loadsPerThreadY
){
// Total number of floats in the shared tile.
const int smemSize = SUBPICW * SUBPICW;
// Start of this thread's row inside the shared tile.
const int smemYOffset = IMUL(threadIdx.y, SUBPICW);
// Shared-tile distance covered by one pass of the whole block in y (blockDim.y rows).
const int smemYBlockOffset = IMUL(blockDim.y, SUBPICW);
// Start of this thread's row inside the image, relative to the block's first row.
const int yOffset = IMUL(threadIdx.y, dataW);
// Image distance covered by one pass of the whole block in y (blockDim.y rows).
const int localYBlockOffset = IMUL(blockDim.y, dataW);
// Linear image offset of this block's first row.
const int globalYBlockOffset = IMUL(blockIdx.y, blockDim.y * dataW );
// First image column handled by this block.
const int xBlockOffset = IMUL(blockIdx.x, blockDim.x);
//const int apronOffset = (APRON0 * dataW) - APRON0;
// Block-shared staging tile for the input pixels (including the apron).
__shared__ float data[SUBPICW*SUBPICW];
int currentXIdx = 0;
int smemPos = 0;
int smemPosData = 0;
int gmemPos = 0;
int gmemPosData = 0;
// Fill the shared tile: k steps through passes in y, l through passes in x.
for (int k = 0; k < loadsPerThreadY; k++)
{
for (int l = 0; l < loadsPerThreadX; l++)
{
// Tile column this thread loads on pass l.
currentXIdx = threadIdx.x + (l*blockDim.x);
// Skip columns that lie beyond the tile width.
if (currentXIdx < SUBPICW)
{
smemPos = currentXIdx + smemYOffset + (k * smemYBlockOffset);
// Skip positions that lie beyond the end of the tile.
if (smemPos < smemSize)
{
// Column coordinate, still shifted right by APRON0 (the shift is removed below).
gmemPos = currentXIdx + xBlockOffset;
// Clamp the column at the right and left image borders.
// NOTE(review): the right-hand clamp yields dataW + APRON0, which becomes
// column dataW (one past the last column) once APRON0 is subtracted below --
// confirm whether dataW + APRON0 - 1 was intended.
if (gmemPos - APRON0 >= dataW)
{
gmemPos = dataW + APRON0;
}
else if (gmemPos < APRON0) {
gmemPos = APRON0;
}
// Add the row offset and move APRON0 rows up and APRON0 columns left so that
// the tile starts at the apron's top-left corner.
gmemPos+= (yOffset + globalYBlockOffset + (k * localYBlockOffset) - (APRON0 * dataW) - APRON0);
// Clamp the linear index so that the read below stays inside the image buffer.
// NOTE(review): the lower clamp goes to APRON0 rather than 0 -- confirm intent.
if (gmemPos < APRON0)
{
gmemPos = APRON0;
}
else if (gmemPos >= dataW*dataH)
{
gmemPos = dataW*dataH - 1;
}
data[smemPos] = d_Input[gmemPos];
}
}
}
}
// Barrier: every thread's tile writes must complete before any thread reads the tile.
__syncthreads();
// Tile position of this thread's own pixel, offset by the apron in both directions.
smemPosData = threadIdx.x + smemYOffset + APRON0 + (APRON0 * SUBPICW);
//smemPosData = threadIdx.x + ((threadIdx.y) * SUBPICW);
// Linear image index this thread writes its results to.
gmemPosData = threadIdx.x + xBlockOffset
+ yOffset + globalYBlockOffset;
/////////////////////////////////////////////////////////////////////////////////calculate 1st filter convolution
float sum0 = 0;
#ifdef UNROLL_INNER
// Delegate the accumulation to the templated helper (defined outside this excerpt).
sum0 = convolutionGaborMax18<2 * KERNEL_RADIUS0>(data + smemPosData, d_Kernel0);
#else
// For each row offset k, accumulate 37 hand-unrolled horizontal taps
// (offsets -18..+18) of tile value times kernel coefficient.
for (int k = -KERNEL_RADIUS0; k <= KERNEL_RADIUS0; k++) {
sum0 += data[smemPosData -18 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -18)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -17 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -17)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -16 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -16)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -15 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -15)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -14 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -14)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -13 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -13)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -12 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -12)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -11 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -11)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -10 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -10)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -9 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -9)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -8 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -8)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -7 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -7)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -6 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -6)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -5 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -5)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -4 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -4)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -3 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -3)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -2 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -2)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData -1 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 -1)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +1 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +1)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +2 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +2)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +3 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +3)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +4 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +4)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +5 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +5)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +6 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +6)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +7 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +7)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +8 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +8)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +9 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +9)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +10 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +10)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +11 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +11)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +12 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +12)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +13 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +13)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +14 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +14)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +15 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +15)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +16 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +16)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +17 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +17)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
sum0 += data[smemPosData +18 + (k*SUBPICW)]
* d_Kernel0[(KERNEL_RADIUS0 +18)
+ ((KERNEL_RADIUS0 + k)*KERNEL_W0)];
}
#endif
// Scale the accumulated response by the constant 18.
// NOTE(review): the meaning of this divisor is not evident from the code -- confirm.
sum0 /= 18;
// Store one response per filter at this thread's image position.
// NOTE(review): sum1..sum15 are not computed anywhere in this excerpt.
d_Result0[gmemPosData] = sum0;
d_Result1[gmemPosData] = sum1;
d_Result2[gmemPosData] = sum2;
d_Result3[gmemPosData] = sum3;
d_Result4[gmemPosData] = sum4;
d_Result5[gmemPosData] = sum5;
d_Result6[gmemPosData] = sum6;
d_Result7[gmemPosData] = sum7;
d_Result8[gmemPosData] = sum8;
d_Result9[gmemPosData] = sum9;
d_Result10[gmemPosData] = sum10;
d_Result11[gmemPosData] = sum11;
d_Result12[gmemPosData] = sum12;
d_Result13[gmemPosData] = sum13;
d_Result14[gmemPosData] = sum14;
d_Result15[gmemPosData] = sum15;
}
我对其余 15 个滤波器的卷积进行了同样的计算,从而得到 sum1, …, sum15(这部分代码在上面省略了)。
我在 32 位系统上运行我的代码,不确定这段代码是否需要在 64 位计算机上执行;而且我不明白这个错误的含义。
答案 0 :(得分:1)
我知道这个问题已经很久了,但这个答案也许能帮到遇到同类错误信息的人。“invalid device function” 通常表示可执行文件中没有与当前 GPU 相匹配的设备代码。我建议您检查编译 CUDA 代码时指定的目标架构是否与 GPU 的计算能力一致,例如:
-gencode arch=compute_20,code=sm_20
可以运行 CUDA 示例中的 deviceQuery 来查看 GPU 的计算能力,以确保上述编译选项设置正确。