CUDA_SAFE_CALL:遇到非法内存访问

时间:2017-04-22 00:56:30

标签: cuda

我正在尝试在CUDA上进行简单的矩阵乘法。我知道可以把数组展平后再传给设备,但这里我使用cudaMallocPitch和cudaMemcpy2D来做乘法。执行下面的代码、在尝试把结果复制回主机时,出现了"遇到非法内存访问"(an illegal memory access was encountered)错误。非常感谢任何关于我哪里出错的建议,谢谢!

权重(weights) - 第一个矩阵,维度:30x784

输入(input) - 第二个矩阵,维度:784x100

results_d - 设备上的结果(GPU)

结果 - 在主机上复制结果

#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>

// Error-checking wrapper: wrap every CUDA runtime call so failures are
// reported with their source location instead of being silently ignored.
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string plus file/line and, when abort is true
// (the default), terminates the process with the error code.
// Fix: 'file' must be const char* — __FILE__ expands to a string literal,
// which may not bind to a non-const char* in C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Naive matrix multiply on pitched device buffers: one thread per output
// element, indexed only by threadIdx (single block assumed).
// NOTE(review): this is the buggy version the question asks about; the
// defects identified in the answer below are flagged inline.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch)
{
int row = threadIdx.x;
int col=  threadIdx.y;
// BUG: never initialized to zero before being accumulated into.
double value;
double *result_matrix;

// BUG: misplaced closing parenthesis — 'col' is added as a BYTE offset
// instead of a double-element offset, unlike the input indexing below.
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));


// BUG: threadIdx is a struct (uint3); printing it with %d is invalid.
printf("%d",threadIdx);

// BUG: in_pitch is the allocation pitch in BYTES, not the inner matrix
// dimension, so this loop runs far past the valid data width.
for(int i =0 ; i < in_pitch ; i++)

{

double *element1 = ((double*)((char*)input + row*in_pitch) + i) ;
double *element2 =   ((double*)((char*)weights + i*w1_pitch) + col);

// BUG: '=+' assigns +(product) every iteration; '+=' was intended.
value =+ (*element1) * (*element2);

}

*result_matrix = value;

}





// Host driver for the buggy kernel above: fills two constant matrices,
// copies them to pitched device buffers, launches the kernel, and copies
// the product back. The defects called out in the answer are flagged inline.
int main()
{

// arr1: 30x784 "weights", arr2: 784x100 "input", result: 30x100 product.
static double arr1[30][784];
static double arr2[784][100];
static double result[30][100];



for (int i = 0 ; i < 30; i++)

{
for(int j =0;j <784 ; j ++)
arr1[i][j] = 5;

}

for (int i =0 ; i < 784; i ++)
{

for(int j=0;j < 100 ; j++)
arr2[i][j] = 3;

}



double *input;
double *weights;
double *results_d;

// Row strides in bytes reported by cudaMallocPitch.
size_t in_pitch,w1_pitch,result_pitch;



//allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,100*sizeof(double),784));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,784*sizeof(double),30));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,100*sizeof(double),30));

//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,100*sizeof(double),100*sizeof(double),784,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,784*sizeof(double),784*sizeof(double),30,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,100*sizeof(double),100*sizeof(double),30,cudaMemcpyHostToDevice));


//using GPU


    // BUG: a single 32x32 block cannot cover the whole result matrix; the
    // grid must be sized from the matrix dimensions.
    dim3 dimGrid(1,1,1);
    dim3 dimBlock(32,32,1);
    printf("before kernel fucntion");   
    // BUG: no cudaGetLastError() after the launch, so launch errors and
    // in-kernel faults are never reported here.
    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch);    
    printf("after kernel fucntion");
    // NOTE(review): cudaThreadSynchronize() is deprecated; use
    // cudaDeviceSynchronize() (and check its return value).
    cudaThreadSynchronize();

//copying back to host
// BUG: the pitch arguments are swapped — the destination (host) pitch is
// 100*sizeof(double) and the source (device) pitch is result_pitch. This
// is the call that reports "an illegal memory access was encountered".
CUDA_SAFE_CALL(cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost));


//printing and seeing whether the result matrix has been updated     
for (int i =0 ; i < 100; i ++)
{

// BUG: prints the array's address, not an element; should be result[j][i]
// (and the loop bounds are transposed relative to the 30x100 matrix).
for(int j=0;j < 30 ; j++)
{
printf("%f",result);

}
printf("\n");
}

CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));


return 0;
}

1 个答案:

答案 0 :(得分:1)

此代码中存在许多错误。首先,并不清楚使用带间距(pitched)的分配在这里是否会带来任何好处。其次,如果你真的想要快速的矩阵乘法性能,你应该使用CUBLAS。

的问题:

  1. 您似乎没有理解带间距(pitched)的分配。返回的pitch值以 **字节** 为单位,不能把它直接当作矩阵乘法的循环上限来用。此外,pitch是整行分配的总宽度(字节数),并不对应有效数据区域;循环上限应该使用真正的矩阵维度。

  2. 您的代码不会对整个矩阵区域进行矩阵乘法。您只创建一个32x32线程的块,但是您需要足够的块/线程来覆盖整个矩阵区域。这需要更改网格维度,将矩阵维度传递给内核,以及内核中的“线程检查”以防止越界访问。

  3. 这种用于带间距(pitched)访问的指针构造方式不正确:

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
    

    它与两个输入矩阵的索引构造方式不一致,而且有一个放错位置的右括号。

  4. 您把两个输入矩阵的角色弄反了:您对input矩阵的索引方式把它当成了weights矩阵,反之亦然。需要交换row、column和i的含义,使索引符合实际的矩阵维度。

  5. 您最后一个cudaMemcpy2D操作中的两个pitch(间距)参数弄反了:

  6. cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)

                        ^^^^^                   ^^^^^
    
    1. 您忘记把循环累加变量初始化为零:

      double value;
      
    2. 我猜您想要的是+=而不是=+:

      value =+ ...
      
    3. 以下代码解决了这些问题,似乎对我没有错误运行:

      $ cat t104.cu
      #include <stdio.h>
      #include <math.h>
      #include <cstdio>
      #include <cstdlib>
      
      // Matrix dimensions: weights (arr1) is d1 x d2, input (arr2) is
      // d2 x d3, so the product (result) is d1 x d3.
      const int d1 = 30;
      const int d2 = 784;
      const int d3 = 100;
      
      double arr1[d1][d2];    // weights
      double arr2[d2][d3];    // input
      double result[d1][d3];  // product, filled by the GPU
      
      
      // Wrap every CUDA runtime call; on failure, report the error string
      // with file/line context and (by default) terminate the process.
      #define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
      
      inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
      {
          // Success is the common case — bail out early.
          if (code == cudaSuccess)
              return;
          fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
          if (abort)
              exit(code);
      }
      
      // result = weights (rrow x dim) * input (dim x rcol), all three stored
      // in pitched device buffers whose pitches are given in BYTES.
      // Launch with one thread per output element; threads that fall outside
      // the rrow x rcol output simply return.
      __global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
      {
        const int col = blockIdx.x * blockDim.x + threadIdx.x;
        const int row = blockIdx.y * blockDim.y + threadIdx.y;
      
        // Bounds guard: the grid is rounded up, so some threads are extras.
        if (row >= rrow || col >= rcol)
          return;
      
        // Row base pointers: pitch is a byte stride, so step via char*.
        const double *w_row = (const double*)((const char*)weights + row * w1_pitch);
        double *out_row = (double*)((char*)results_d + row * result_pitch);
      
        // Dot product of weights row 'row' with input column 'col'.
        double acc = 0.0;
        for (int k = 0; k < dim; ++k)
        {
          const double *in_row = (const double*)((const char*)input + k * in_pitch);
          acc += w_row[k] * in_row[col];
        }
      
        out_row[col] = acc;
      }
      
      
      
      
      
      // Host driver: builds two constant matrices, multiplies them on the
      // GPU (result[d1][d3] = arr1[d1][d2] * arr2[d2][d3]) and prints the
      // product. Every element should come out as 5 * 3 * d2.
      int main()
      {
      
      
        for (int i = 0 ; i < d1; i++)
      
        {
          for(int j =0;j <d2 ; j ++)
            arr1[i][j] = 5;
      
        }
      
        for (int i =0 ; i < d2; i ++)
        {
      
          for(int j=0;j < d3 ; j++)
            arr2[i][j] = 3;
      
        }
      
      
      
        double *input;
        double *weights;
        double *results_d;
      
        // Row strides in BYTES, reported by cudaMallocPitch.
        size_t in_pitch,w1_pitch,result_pitch;
      
      
      
      //allocating memory in GPU for 2 inputs and result
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
      
      //Copy matrix from host to device (host rows are densely packed, so the
      //source pitch is just the row width in bytes)
        CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
      
      
      //using GPU
      
        // One thread per output element; round the grid up so the whole
        // d1 x d3 result is covered (the kernel bounds-checks the extras).
        dim3 dimBlock(32,32,1);
        dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
        // Catch launch-configuration errors immediately, then surface any
        // asynchronous in-kernel faults before touching the result.
        CUDA_SAFE_CALL(cudaGetLastError());
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
      
      //copying back to host: destination (host) pitch first, then the
      //device pitch — note the order relative to the H2D copies above
        CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
      
      
      //printing and seeing whether the result matrix has been updated
        for (int i =0 ; i < d3; i ++)
        {
      
          for(int j=0;j < d1 ; j++)
          {
            printf("%f", result[j][i]);
      
          }
          printf("\n");
        }
      
        CUDA_SAFE_CALL(cudaFree(input));
        CUDA_SAFE_CALL(cudaFree(weights));
        CUDA_SAFE_CALL(cudaFree(results_d));
      
      
        return 0;
      }
      $ nvcc -arch=sm_61 -o t104 t104.cu
      $