Using cudaMemcpy3D to transfer a *** pointer

Asked: 2014-04-26 11:57:36

Tags: cuda

I am trying to use cudaMemcpy3D to transfer a dynamically allocated 3D matrix (tensor). The tensor is allocated as a contiguous block of memory (see the code below). I have tried various combinations of cudaExtent and cudaMemcpy3DParms, but the order of the elements gets mixed up. I created the following example to demonstrate the problem:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));            // slice pointers
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));      // row pointers
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int))); // one contiguous data block

   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;

   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
            tensorrow = (int *)(tensorslice + i * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
         }
      }
   }   
}

int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;

   h_tensor = alloc_tensor(Nx, Ny, Nz);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }

   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   cudaMemcpy3D(&cpy);

   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);

   // ... clean-up
}

The output for the host (h_tensor) and the device (d_tensor) differs, and looks like:

h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...

d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...

What am I doing wrong? What is the correct way to use cudaMemcpy3D?

1 Answer:

Answer 0 (score: 3)

  1. Any time you are having trouble with CUDA code, it's a good idea to do proper cuda error checking. The code you have posted here does not run correctly, at least for me; the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not the same as the code you have shown here, but that's just a guess.) A minimal error-checking sketch follows.
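
     The linked approach boils down to checking the return value of every CUDA API call and calling cudaGetLastError() after kernel launches. A minimal sketch of such a macro (the name and messages here are illustrative, not part of this answer):

    #include <stdio.h>
    #include <stdlib.h>

    // Report and abort if any preceding CUDA call or kernel launch failed.
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                exit(1); \
            } \
        } while (0)

     Placing cudaCheckErrors("cudaMemcpy3D failed"); right after the cudaMemcpy3D call in the question code would have surfaced the error from item 2 immediately.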
  2. Your usage of make_cudaPitchedPtr is not correct:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
    

     Review the API documentation. Creating a CUDA pitched pointer this way is no different between the 2D and 3D case: the parameters are the pointer, the pitch in bytes, and the logical width and height of the allocation, i.e. make_cudaPitchedPtr(ptr, pitch, xsize, ysize). Therefore it makes no sense to pass 3 different dimensions as you are doing. Instead, do this:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
    
  3. The remaining issues I found I would attribute to an incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the dimension where values that are adjacent in memory occupy adjacent index values. Your usage of Z in the third dimension was confusing to me for this reason. Your host allocation used Nx in the first subscript position, but your device indexing did not match this. There are obviously multiple ways to handle this; if you don't like my arrangement you can change it, but the host and device indexing must match (see the layout sketch after this item).
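
     A minimal illustration (not part of the original answer) of what "last subscript varies fastest" means for the contiguous allocation produced by alloc_tensor:

    #include <assert.h>

    // With alloc_tensor's layout, element [i][j][k] sits at a single
    // row-major offset into the contiguous data block at tensor[0][0].
    void check_layout(int ***tensor, int Nx, int Ny, int Nz) {
       int *flat = tensor[0][0];
       for (int i = 0; i < Nx; i++)
          for (int j = 0; j < Ny; j++)
             for (int k = 0; k < Nz; k++)
                assert(&tensor[i][j][k] == flat + (i * Ny + j) * Nz + k);
    }

     Whichever subscript comes last on the host varies fastest in memory, so it must correspond to the width (x) dimension of the cudaExtent for the copy to preserve element order.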

  4. In any event, the following modification of your code works for me:

    #include <stdio.h>
    
    int ***alloc_tensor(int Nx, int Ny, int Nz) {
       int i, j;
       int ***tensor;
    
       tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
       tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
       tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
    
       for(j = 1; j < Ny; j++)
          tensor[0][j] = tensor[0][j-1] + Nz;
       for(i = 1; i < Nx; i++) {
          tensor[i] = tensor[i - 1] + Ny;
          tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
          for(j = 1; j < Ny; j++)
             tensor[i][j] = tensor[i][j - 1] + Nz;
       }
    
       return tensor;
    }
    
    __global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
       int i, j, k;
       char *tensorslice;
       int *tensorrow;
    
       for (i = 0; i < Nx; i++) {
          for (j = 0; j < Ny; j++) {
             for (k = 0; k < Nz; k++) {
                tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny; // start of slice k
                tensorrow = (int *)(tensorslice + j * tensor.pitch);        // row j within that slice
                printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]); // element i within that row
             }
          }
       }
    }
    
    int main() {
       int i, j, k, value = 0;
       int Nx = 2, Ny = 6, Nz = 4;
    
       int ***h_tensor;
       struct cudaPitchedPtr d_tensor;
    
       h_tensor = alloc_tensor(Nz, Ny, Nx); // dimensions reordered: x is now the fastest-varying subscript
       cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
    
       for(i = 0; i < Nx; i++) {
          for(j = 0; j < Ny; j++) {
             for(k = 0; k < Nz; k++) {
                h_tensor[k][j][i] = value++;
                //printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
             }
          }
       }
       for(i = 0; i < Nx; i++) {
          for(j = 0; j < Ny; j++) {
             for(k = 0; k < Nz; k++) {
                //h_tensor[i][j][k] = value++;
                printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
             }
          }
       }
    
       cudaMemcpy3DParms cpy = { 0 };
       cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
       cpy.dstPtr = d_tensor;
       cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
       cpy.kind = cudaMemcpyHostToDevice;
    
       cudaMemcpy3D(&cpy);
    
       kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
       cudaDeviceSynchronize();
       // ... clean-up
    }
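
For reference, a possible version of the clean-up that both listings elide (a sketch matching the allocation scheme above; error checking omitted):

    free(h_tensor[0][0]);    // the contiguous data block
    free(h_tensor[0]);       // the row-pointer block
    free(h_tensor);          // the slice-pointer block
    cudaFree(d_tensor.ptr);  // the pitched device allocation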