我的CUDA内核代码不起作用

时间:2015-04-24 23:38:05

标签: cuda nsight

我尝试制作一个小代码来生成数字并将结果返回到数组中但是一旦我运行此代码它就无法工作,我尝试使用Nsight调试器来了解我的问题在哪里,但它会立即冻结并关闭。 / p>

请您帮我解释一下这段代码中的问题在哪里?

__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp, 
                              int m[2], int p[5], int q[5], int i, int* n, 
                              int out[10][5], int N)
    {
        int id = blockDim.x * blockIdx.x + threadIdx.x;

        int idx = blockIdx.x;
        int idy = blockIdx.y;

        int w = idx/100;
        int x = idx%100;
        int y = idy;

        int z = threadIdx.x;

        int len = ((i * 2) + 5);


        // Fill PF_tmp & QF_tmp
        if( i > 0){
            for(int k = 0; k < (i * 2); k++)
            {
                p[k]   = PF_tmp[k];
                q[k]   = QF_tmp[k];
            }
        }

        // Fill X
        if( x > 10)
        {
            p[(i*2)] = (x - (x % 10)) / 10;
            p[(i*2)+1] = x % 10;
        }else{
            p[(i*2)] = 0;
            p[(i*2)+1] = x;
        }

        // Fill Y
        if( y > 10)
        {
            q[(i*2)] = (y - (y % 10)) / 10;
            q[(i*2)+1] = y % 10;
        }else{
            q[(i*2)] = 0;
            q[(i*2)+1] = y;
        }

        // Fill m
        p[(i * 2)+2] = m[0];
        q[(i * 2)+2] = m[1];

        // Fill W 
        if( w > 10)
        {
            p[(i*2)+3] = (w - (w % 10)) / 10;
            p[(i*2)+4] = w % 10;
        }else{
            p[(i*2)+3] = 0;
            p[(i*2)+4] = w;
        }

        // Fill Z 
        if( z > 10)
        {
            q[(i*2)+3] = (z - (z % 10)) / 10;
            q[(i*2)+4] = z % 10;
        }else{
            q[(i*2)+3] = 0;
            q[(i*2)+4] = z;
        }

        // Fill PL_tmp & QL_tmp
        if( i > 0)
        {
            for(int k = 0; k < (i * 2); k++)
            {
                p[(len-(i * 2))+k]   = PL_tmp[k];
                q[(len-(i * 2))+k]   = QL_tmp[k];
            }
        }

        if(id<10)
        {
            for(int k =0; k<5; k++)
                out[id][k] = p[k];
        }

    }



    int main()
    {
        cudaError err;
        dim3 blocks(10000, 100);
        dim3 threads(100); 

        int m[2] = {4,5};
        int hst_out[10][5];
        int p[5];
        int q[5];
        err = cudaMalloc((void **)&p, 5);
        err = cudaMalloc((void **)&q, 5);
        err = cudaMalloc((void **)&hst_out, 50);

        mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);

        return 0;
    }

1 个答案:

答案 0 :(得分:1)

错误非常明显,它全是C编程。

宣布

        int m[2] = {4,5};
        int hst_out[10][5];
        int p[5];
        int q[5];

现在hst_out,p,q不是指针,但后来它被用作指针:

        err = cudaMalloc((void **)&p, 5);
        err = cudaMalloc((void **)&q, 5);
        err = cudaMalloc((void **)&hst_out, 50);

所以你应该最初将它声明为指针,例如,

        int *p;

并以这种方式使用它:

        err = cudaMalloc((void **)&p, 5*sizeof(int));

另请注意,您声明的大小只有5个字节....而我将其声明为5 * sizeof(int)。

有关更多示例,请参阅:

http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html