Question

我正在尝试使用cudaMalloc3D将扁平2D阵列的内存分配并复制到设备上，以测试cudaMalloc3D的性能。但是当我尝试从内核写入数组时，它会抛出'遇到非法内存访问'异常。如果我只是从数组中读取，但是当我尝试写入它时，程序运行正常，则会出错。任何有关这方面的帮助将不胜感激。下面是我的代码和编译代码的语法。

使用

进行编译

nvcc -O2 -arch sm_20 test.cu

代码：test.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265 
#define NX 8192     /* includes boundary points on both end */
#define NY 4096     /* includes boundary points on both end */
#define NZ 1        /* needed for cudaMalloc3D */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X 
#define N_BLOCKS_Y NY/N_THREADS_Y 

#define LX 4.0    /* length of the domain in x-direction  */
#define LY 2.0    /* length of the domain in x-direction  */
#define dx       (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd     5.0
#define dt       (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR   ( cSqrd * (dt*dt)/(dx*dx) )

#define IC  (i + j*NX)       /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */


// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
  cudaError_t e = cudaGetLastError();\
  if( e != cudaSuccess ) {\
    printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
    exit(EXIT_FAILURE);\
  }\
}

typedef double REAL;
typedef int    INT;


void meshGrid ( REAL *x, REAL *y )
{

  INT i,j;
  REAL a;
  for (j=0; j<NY; j++) {
    a = dx * ( (REAL) j );
    for (i=0; i<NX; i++) {
      x[IC] =  dx * ( (REAL) i );
      y[IC] = a;
    }
  }
}


void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{                    
  INT i,j;
  for (j=1; j<NY-1; j++) {
    for (i=1; i<NX-1; i++) {
      u[IC] =  0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
    }
  }

  for (j=1; j<NY-1; j++) {
    for (i=1; i<NX-1; i++) {
      uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
    }
  }
}


__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{

 INT i,j;

 i = blockIdx.x*blockDim.x + threadIdx.x;
 j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

  char *unewPtr  = (char *) unew.ptr;
  REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);

  REAL tmp = unew_row[j]; // no error on this line
  unew_row[j] = 1.2; // this is where I get the error
 }

}


INT main(INT argc, char *argv[])
{

  INT nTimeSteps = 10;  

  // pointers for the host side
  REAL *unew, *u, *uold, *uFinal, *x, *y;

  // allocate memory on the host
  unew        = (REAL *)calloc(NX*NY,sizeof(REAL));
  u           = (REAL *)calloc(NX*NY,sizeof(REAL));
  uold        = (REAL *)calloc(NX*NY,sizeof(REAL));
  uFinal      = (REAL *)calloc(NX*NY,sizeof(REAL));
  x           = (REAL *)calloc(NX*NY,sizeof(REAL));
  y           = (REAL *)calloc(NX*NY,sizeof(REAL));


  // pointer for the device side
  size_t pitch = NX * sizeof(REAL);
  cudaPitchedPtr  d_u, d_uold, d_unew, d_tmp;
  cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);

  // allocate 3D memory on the device
  cudaMalloc3D( &d_u, myExtent );    cudaCheckError();
  cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
  cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();


  // initialize grid and wave
  meshGrid( x, y );
  initWave( u, uold, x, y );


  // copy host memory to 3D device memory
  cudaMemcpy3DParms cpy3D = { 0 };
  cpy3D.kind = cudaMemcpyHostToDevice;

  // copying u to d_u
  cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
  cpy3D.dstPtr = d_u;
  cpy3D.extent = myExtent;
  cudaMemcpy3D( &cpy3D ); cudaCheckError();  

  // copying uold to d_uold
  cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
  cpy3D.dstPtr = d_uold;
  cpy3D.extent = myExtent;
  cudaMemcpy3D( &cpy3D ); cudaCheckError();  


  //  set up the GPU grid/block model
  dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y  );
  dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );

  for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
    solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
    cudaThreadSynchronize();
    cudaCheckError();

    d_tmp  = d_uold;
    d_uold = d_u;
    d_u    = d_unew;
    d_unew = d_tmp;
  }

  // copy the memory back to host
  cpy3D.kind = cudaMemcpyDeviceToHost;

  // copying d_unew to uFinal
  cpy3D.srcPtr = d_unew;
  cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
  cpy3D.extent = myExtent;
  cudaMemcpy3D( &cpy3D ); cudaCheckError();  

  free(u);    cudaFree(d_u.ptr);
  free(unew); cudaFree(d_unew.ptr);
  free(uold); cudaFree(d_uold.ptr);

  free(uFinal); free(x); free(y);

  return EXIT_SUCCESS;
}

Answer 1

此行未发生错误的原因：

REAL tmp = unew_row[j]; // no error on this line

是因为编译器正在优化该行。它没有做任何有用的事情，因此编译器完全消除了它。编译器警告：

xxx.cu(87): warning: variable "tmp" was declared but never referenced

暗示了这种效果。

您的代码非常接近正确。问题出在这里：

REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);

应该是：

REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);

内核中的i变量是 width （即X）维度。 j变量是 height （即Y）维度。

高度是指您所在的行，因此行间距应乘以高度参数，即j，而不是i。

同样，虽然它不是特定维度的特定失败的来源，但此代码可能不是您想要的：

REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error

例如，如果您打算计算行的偏移量然后索引到行（例如，可能要设置位置中的每个元素），那么我认为您会想要使用i不是j作为您的最终索引：

REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error

但是，对于此特定示例，这不是非法内存访问的实际来源。

尝试写入使用cudaMalloc3D分配的2D数组时，“非法内存访问”

1 个答案: