在cuda __global__函数内打印会杀死该线程

时间:2015-09-02 20:20:42

标签: cuda

我的代码遇到了一个奇怪的问题。如果我尝试在线程中打印某个变量的值,则不会将任何内容写入屏幕,并且所有线程都会在该点停止。这是代码:

    // Number of pixels taken on each side of the centre pixel when the kernel
    // gathers its square neighbourhood.
    #define WINSIZE 1
    // Pixel count of one (2*WINSIZE+1) x (2*WINSIZE+1) neighbourhood.
    const int nebsize = (2 * WINSIZE + 1) * (2 * WINSIZE + 1);

    // Per-pixel kernel: for every eligible pixel, gathers the surrounding
    // (2*WINSIZE+1) x (2*WINSIZE+1) window of 3-component samples, computes the
    // per-component mean and a regularised 3x3 covariance-style matrix, inverts
    // that matrix, and finally marks the pixel by writing 666 to consts[ind].
    //
    // Parameters:
    //   img     - read as img[3*(row*w+col)+c], c in {0,1,2}: three interleaved
    //             doubles per pixel, rows of width w.
    //   consts  - one int per pixel; a non-zero entry makes the thread skip the
    //             pixel. Overwritten with 666 for every pixel that is processed.
    //   w, h    - image width and height used for index arithmetic and bounds.
    //   epsilon - added (divided by nebsize) to the diagonal of the 3x3 matrix.
    //
    // Indexing uses only the x components of blockIdx/blockDim/threadIdx, so the
    // launch must supply at least w*h threads along x.
    //
    // NOTE(review): the inverse stored in win_var is not written to any output
    // in this snippet; the only global write is consts[ind]=666.
    // NOTE(review): the host code is not shown. Check cudaGetLastError() after
    // the launch - a launch that fails outright would look exactly like
    // "printf stopped every thread" - TODO confirm against the caller.
    __global__ void loop(double *img, int *consts, int w, int h, double epsilon){

        // Flat pixel index handled by this thread.
        int ind=blockIdx.x*blockDim.x+threadIdx.x;
        // Process only in-range pixels whose consts entry is zero and whose
        // column (ind%w) and row (ind/w) are at least WINSIZE away from every
        // image border, so the whole window lies inside the image.
        if(ind<w*h && !consts[ind] && ind%w>=WINSIZE && ind%w<w-WINSIZE && ind/w>=WINSIZE && ind/w<h-WINSIZE){
            // win_inds is filled below but never read in this kernel.
            int win_inds[nebsize];
            // The window's samples, 3 doubles per window pixel.
            double winI[3*(2*WINSIZE+1)*(2*WINSIZE+1)];
            // winI_re_aux and tvals are declared but never used in this kernel.
            double winI_re_aux[3*nebsize];
            // 3x3 matrix (row-major) before inversion.
            double pre_win_var[9];
            // 3x3 inverse of pre_win_var (row-major); first written near the end.
            double win_var[9];
            // Mean of each of the 3 components over the window.
            double win_mu[3];
            double tvals[nebsize*nebsize];
            // Determinant of pre_win_var.
            double detwin;
            // Inclusive column (i) and row (j) bounds of the window.
            int min_i=ind%w-WINSIZE;
            int max_i=ind%w+WINSIZE;
            int min_j=ind/w-WINSIZE;
            int max_j=ind/w+WINSIZE;
            int k;
            int l;
            k=0;        
            // Record h*i+j for each window pixel (i = column, j = row).
            // The result is not consumed anywhere below.
            for(int i=min_i; i<=max_i; i++){
                for(int j=min_j; j<=max_j; j++){
                    win_inds[k]=h*i+j;
                    k++;
                }
            }
            // Copy the window's 3-component samples from img into winI.
            // k counts rows and l counts columns of the window; slot l*(2*WINSIZE+1)+k
            // receives the pixel at (row j, column i).
            k=0;
            for(int j=min_j; j<=max_j; j++){        
                l=0;
                for(int i=min_i; i<=max_i; i++){
                    winI[3*(l*(2*WINSIZE+1)+k)]=img[3*(j*w+i)];
                    winI[3*(l*(2*WINSIZE+1)+k)+1]=img[3*(j*w+i)+1];
                    winI[3*(l*(2*WINSIZE+1)+k)+2]=img[3*(j*w+i)+2];
                    l++;
                }
                k++;
            }

            // Per-component mean over the nebsize window pixels.
            win_mu[0]=0;
            win_mu[1]=0;
            win_mu[2]=0;    
            for(int i=0; i<nebsize; i++){
                win_mu[0]+=winI[3*i];
                win_mu[1]+=winI[3*i+1];
                win_mu[2]+=winI[3*i+2];
            }
            win_mu[0]=win_mu[0]/(double)nebsize;
            win_mu[1]=win_mu[1]/(double)nebsize;
            win_mu[2]=win_mu[2]/(double)nebsize;
            //all ok here

            //this works here
            // Debug print from the single thread with ind==200.
            // NOTE(review): win_var has not been written yet at this point, so
            // the printed value is indeterminate.
            if(ind==200){   
                    printf("%f\n", win_var[8]);
            }

            // pre_win_var[3*i+j] = mean over the window of (component i * component j)
            //                      + (i==j)*epsilon/nebsize - win_mu[i]*win_mu[j]
            for(int i=0; i<3; i++){
                for(int j=0; j<3; j++){
                    pre_win_var[3*i+j]=0;
                    for(int n=0; n<nebsize; n++){
                        pre_win_var[3*i+j]+=winI[3*n+i]*winI[3*n+j];
                    }
                    pre_win_var[3*i+j]=pre_win_var[3*i+j]/(double)nebsize;
                    pre_win_var[3*i+j]+=(i==j)*epsilon/(double)nebsize-win_mu[j]*win_mu[i];
                }
            }
            //this kills all threads          
            // Second debug print from the thread with ind==200.
            // NOTE(review): win_var is still unwritten here (it is first assigned
            // further down), so this also reads an indeterminate value.
            if(ind==200){   
                    printf("%f\n", win_var[8]);
            }
            // Determinant of the 3x3 matrix: sum of the three "forward" diagonal
            // products minus the three "backward" diagonal products.
            // NOTE(review): detwin is used as a divisor below without a zero check.
            detwin=pre_win_var[0]*pre_win_var[4]*pre_win_var[8]+pre_win_var[2]*pre_win_var[3]*pre_win_var[7]+pre_win_var[1]*pre_win_var[5]*pre_win_var[6];
            detwin-=pre_win_var[6]*pre_win_var[4]*pre_win_var[2]+pre_win_var[3]*pre_win_var[1]*pre_win_var[8]+pre_win_var[7]*pre_win_var[5]*pre_win_var[0];

            // Inverse of pre_win_var: each entry is the matching signed 2x2
            // cofactor (transposed position) divided by the determinant.
            win_var[0]=(pre_win_var[4]*pre_win_var[8]-pre_win_var[5]*pre_win_var[7])/detwin;
            win_var[3]=-(pre_win_var[3]*pre_win_var[8]-pre_win_var[5]*pre_win_var[6])/detwin;
            win_var[6]=(pre_win_var[3]*pre_win_var[7]-pre_win_var[4]*pre_win_var[6])/detwin;
            win_var[1]=-(pre_win_var[1]*pre_win_var[8]-pre_win_var[2]*pre_win_var[7])/detwin;
            win_var[4]=(pre_win_var[0]*pre_win_var[8]-pre_win_var[2]*pre_win_var[6])/detwin;
            win_var[7]=-(pre_win_var[0]*pre_win_var[7]-pre_win_var[1]*pre_win_var[6])/detwin;
            win_var[2]=(pre_win_var[1]*pre_win_var[5]-pre_win_var[2]*pre_win_var[4])/detwin;
            win_var[5]=-(pre_win_var[0]*pre_win_var[5]-pre_win_var[2]*pre_win_var[3])/detwin;
            win_var[8]=(pre_win_var[0]*pre_win_var[4]-pre_win_var[1]*pre_win_var[3])/detwin;                

            //this line gets executed in all threads if I printf nothing
            // Marker write: lets the host verify which threads reached the end
            // by copying consts back and inspecting it.
            consts[ind]=666;

        }
    }

只有在这些值被计算之前,我才能打印 win_var 或 pre_win_var 的值;如果在计算之后再尝试打印,似乎就会杀死所有线程。如果我不打印任何内容,consts[ind]=666 这一行会在所有线程中执行——我之所以知道这一点,是因为我可以把 consts 复制回主机内存并打印出来。那么,有人知道是哪里出了问题吗?

1 个答案:

答案 0 :(得分:1)

问题似乎是资源耗尽。包含 printf 的内核由于含有 ABI 调用,寄存器占用量更大,因此内核启动会失败,并返回 cudaErrorLaunchOutOfResources。

您没有提供有关启动参数的任何细节,但把每个块的线程总数减少到一个更小的、仍为 32 的倍数的值,应该就可以解决问题。