我的代码遇到了一个奇怪的问题。如果我尝试在线程中打印某个变量的值,则不会将任何内容写入屏幕,并且所有线程都会在该点停止。这是代码:
// Half-width of the square neighbourhood window, in pixels: the window spans
// WINSIZE pixels on each side of its centre pixel.
#define WINSIZE 1

// Number of pixels in one (2*WINSIZE+1) x (2*WINSIZE+1) window.
const int nebsize = (2 * WINSIZE + 1) * (2 * WINSIZE + 1);
// Per-pixel window statistics kernel.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per pixel; the flat pixel
// index is blockIdx.x * blockDim.x + threadIdx.x. No shared memory is used.
//
// For every pixel that is inside the image, is at least WINSIZE pixels away
// from each border, and has consts[ind] == 0, the kernel:
//   1. gathers the (2*WINSIZE+1)^2 window of 3-channel values around it,
//   2. computes the per-channel mean of the window,
//   3. builds the 3x3 matrix  E[c_i*c_j] - mu_i*mu_j + (i==j)*epsilon/nebsize,
//   4. inverts that matrix via cofactors and the determinant,
//   5. writes 666 to consts[ind].
//
// Parameters:
//   img     - image data read as img[3*(row*w + col) + channel].
//   consts  - per-pixel flags; pixels with a non-zero entry are skipped and
//             processed pixels are overwritten with 666.
//   w, h    - image width and height in pixels.
//   epsilon - value added (divided by nebsize) to the diagonal of the 3x3
//             matrix before inversion.
//
// NOTE(review): w*h is evaluated in int; presumably images are small enough
// that this does not overflow - confirm against the caller.
__global__ void loop(double *img, int *consts, int w, int h, double epsilon){
    const int ind = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check first: it also guarantees w != 0 before the % and / below.
    if (ind >= w * h) {
        return;
    }

    const int x = ind % w;  // column of this pixel
    const int y = ind / w;  // row of this pixel

    // Skip pixels whose window would leave the image.
    if (x < WINSIZE || x >= w - WINSIZE || y < WINSIZE || y >= h - WINSIZE) {
        return;
    }
    // Skip pixels that are already flagged.
    if (consts[ind]) {
        return;
    }

    const int side = 2 * WINSIZE + 1;

    double winI[3 * nebsize];   // window pixels, 3 channels each
    double win_mu[3];           // per-channel mean over the window
    double pre_win_var[9];      // 3x3 matrix to invert (row-major)
    double win_var[9];          // inverse of pre_win_var (row-major)
    double detwin;              // determinant of pre_win_var

    // Gather the window. Pixels are stored column-by-column inside winI
    // (slot = col*side + row), matching the original storage order so that
    // the floating-point summation order below is unchanged.
    for (int row = 0; row < side; row++) {
        const int j = y - WINSIZE + row;
        for (int col = 0; col < side; col++) {
            const int i = x - WINSIZE + col;
            const int dst = 3 * (col * side + row);
            const int src = 3 * (j * w + i);
            winI[dst]     = img[src];
            winI[dst + 1] = img[src + 1];
            winI[dst + 2] = img[src + 2];
        }
    }

    // Per-channel mean.
    win_mu[0] = 0;
    win_mu[1] = 0;
    win_mu[2] = 0;
    for (int n = 0; n < nebsize; n++) {
        win_mu[0] += winI[3 * n];
        win_mu[1] += winI[3 * n + 1];
        win_mu[2] += winI[3 * n + 2];
    }
    win_mu[0] = win_mu[0] / (double)nebsize;
    win_mu[1] = win_mu[1] / (double)nebsize;
    win_mu[2] = win_mu[2] / (double)nebsize;

    // Second moments minus the product of means, plus epsilon/nebsize on the
    // diagonal.
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            pre_win_var[3 * i + j] = 0;
            for (int n = 0; n < nebsize; n++) {
                pre_win_var[3 * i + j] += winI[3 * n + i] * winI[3 * n + j];
            }
            pre_win_var[3 * i + j] = pre_win_var[3 * i + j] / (double)nebsize;
            pre_win_var[3 * i + j] += (i == j) * epsilon / (double)nebsize - win_mu[j] * win_mu[i];
        }
    }

    // Determinant of the 3x3 matrix (rule of Sarrus).
    detwin  = pre_win_var[0] * pre_win_var[4] * pre_win_var[8]
            + pre_win_var[2] * pre_win_var[3] * pre_win_var[7]
            + pre_win_var[1] * pre_win_var[5] * pre_win_var[6];
    detwin -= pre_win_var[6] * pre_win_var[4] * pre_win_var[2]
            + pre_win_var[3] * pre_win_var[1] * pre_win_var[8]
            + pre_win_var[7] * pre_win_var[5] * pre_win_var[0];

    // Inverse = transposed cofactor matrix divided by the determinant.
    // NOTE(review): no guard against detwin == 0; presumably epsilon > 0 keeps
    // the matrix invertible - confirm.
    win_var[0] =  (pre_win_var[4] * pre_win_var[8] - pre_win_var[5] * pre_win_var[7]) / detwin;
    win_var[3] = -(pre_win_var[3] * pre_win_var[8] - pre_win_var[5] * pre_win_var[6]) / detwin;
    win_var[6] =  (pre_win_var[3] * pre_win_var[7] - pre_win_var[4] * pre_win_var[6]) / detwin;
    win_var[1] = -(pre_win_var[1] * pre_win_var[8] - pre_win_var[2] * pre_win_var[7]) / detwin;
    win_var[4] =  (pre_win_var[0] * pre_win_var[8] - pre_win_var[2] * pre_win_var[6]) / detwin;
    win_var[7] = -(pre_win_var[0] * pre_win_var[7] - pre_win_var[1] * pre_win_var[6]) / detwin;
    win_var[2] =  (pre_win_var[1] * pre_win_var[5] - pre_win_var[2] * pre_win_var[4]) / detwin;
    win_var[5] = -(pre_win_var[0] * pre_win_var[5] - pre_win_var[2] * pre_win_var[3]) / detwin;
    win_var[8] =  (pre_win_var[0] * pre_win_var[4] - pre_win_var[1] * pre_win_var[3]) / detwin;

    // Debug output for one pixel. Placed after win_var has been fully written
    // so that a defined value is printed.
    if (ind == 200) {
        printf("%f\n", win_var[8]);
    }

    // Mark this pixel as processed.
    consts[ind] = 666;
}
win_var 或 pre_win_var 的值只有在计算之前才能打印;如果我在计算之后尝试打印它们,似乎就会杀死所有线程。如果我什么都不打印,consts[ind]=666 这一行会在所有线程中执行——我之所以知道,是因为我可以把 consts 复制回主机内存并打印出来。有人知道是哪里出了问题吗?
答案 0 :(得分:1)
问题似乎是资源耗尽。包含 printf 时,内核启动会因 cudaErrorLaunchOutOfResources 而失败,因为含有 ABI 调用的内核寄存器占用更大。
您没有提供有关启动参数的任何详细信息,但把每个块的线程总数减少为一个更小的 32 的倍数,应该可以解决问题。