I have two uint8_t image buffers that I want to place side by side in CUDA memory, like this:
--------------------- ---------------------
|                   | |                   |
|                   | |                   |
|         1         | |         2         |
|                   | |                   |
|                   | |                   |
--------------------- ---------------------
Using cudaMemcpy doesn't work, because it copies linearly and the second buffer ends up overwriting the first.
cudaArray_t seems to be geared more toward floating point, and there isn't much documentation on it either.
Any help would be appreciated! Thanks.
Answer 0 (score: 1)
Here is a comparison of 2 different approaches. The first uses a kernel to place the two separate buffers "side by side" in device memory, i.e. row-interleaved. The second uses two cudaMemcpy2D calls to do the same thing:
$ cat t346.cu
#include <iostream>
#ifndef DIM
#define DIM 16
#endif
typedef int mt;
template <typename T>
__global__ void sxs(const T * __restrict__ s1, const T * __restrict__ s2, T * dest, size_t width, size_t height){
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  size_t sidx = idx;
  while (sidx < width*height){
    size_t mydiv = sidx/width;                 // source row
    size_t mymod = sidx - (mydiv*width);       // source column
    size_t didx = ((mydiv) * 2)*width + mymod; // left half of the output row (image 1)
    size_t didx2 = didx + width;               // right half of the output row (image 2)
    dest[didx] = s1[sidx];
    dest[didx2] = s2[sidx];
    sidx += gridDim.x*blockDim.x;} // grid-stride loop
}
const size_t w = DIM;
const size_t h = DIM;
int main(){
  // data setup
  mt *h_d1, *h_d2, *h_o, *d_d1, *d_d2, *d_o;
  h_d1 = new mt[w*h];
  h_d2 = new mt[w*h];
  h_o = new mt[w*h*2];
  cudaMalloc(&d_d1, w*h*sizeof(mt));
  cudaMalloc(&d_d2, w*h*sizeof(mt));
  cudaMalloc(&d_o, 2*w*h*sizeof(mt));
  for (int i = 0; i < w*h; i++){
    h_d1[i] = 1;
    h_d2[i] = 2;}
  cudaMemcpy(d_d1, h_d1, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_d2, h_d2, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  // method 1: kernel
  sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "kernel:" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  // method 2: cudaMemcpy2D
  cudaMemcpy2D(d_o, 2*w*sizeof(mt), d_d1, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_o+w, 2*w*sizeof(mt), d_d2, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "cudaMemcpy2D" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  return 0;
}
$ nvcc -o t346 t346.cu
$ cuda-memcheck ./t346
========= CUDA-MEMCHECK
kernel:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
cudaMemcpy2D
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
========= ERROR SUMMARY: 0 errors
$ nvcc -o t346 t346.cu -DDIM=1024
$ nvprof ./t346
==7903== NVPROF is profiling process 7903, command: ./t346
==7903== Profiling application: ./t346
==7903== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 63.26% 5.6010ms 2 2.8005ms 2.0116ms 3.5894ms [CUDA memcpy DtoH]
35.88% 3.1773ms 2 1.5887ms 1.5809ms 1.5965ms [CUDA memcpy HtoD]
0.45% 39.679us 1 39.679us 39.679us 39.679us void sxs<int>(int const *, int const *, int*, unsigned long, unsigned long)
0.41% 36.224us 2 18.112us 18.080us 18.144us [CUDA memcpy DtoD]
API calls: 94.95% 356.93ms 3 118.98ms 290.75us 356.33ms cudaMalloc
2.96% 11.121ms 4 2.7802ms 2.0230ms 4.4443ms cudaMemcpy
1.49% 5.6179ms 384 14.629us 406ns 969.76us cuDeviceGetAttribute
0.43% 1.6087ms 4 402.18us 250.54us 615.60us cuDeviceTotalMem
0.12% 462.90us 4 115.73us 105.58us 134.35us cuDeviceGetName
0.02% 82.153us 2 41.076us 24.136us 58.017us cudaMemcpy2D
0.02% 60.048us 1 60.048us 60.048us 60.048us cudaLaunchKernel
0.01% 24.121us 4 6.0300us 4.1910us 8.5880us cuDeviceGetPCIBusId
0.00% 10.201us 8 1.2750us 534ns 2.7570us cuDeviceGet
0.00% 6.6820us 3 2.2270us 368ns 3.8570us cuDeviceGetCount
0.00% 2.8140us 4 703ns 583ns 844ns cuDeviceGetUuid
$
We can see that for the test case above, with 1024x1024 images, the kernel method takes about 40 microseconds, whereas the two cudaMemcpy2D operations together take about 80 microseconds.
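If nvprof is not available, a comparable kernel timing can be obtained with CUDA events. This is only a minimal sketch, not part of the test program above; it reuses its d_d1, d_d2, d_o, w and h:

// hedged sketch: time the sxs kernel with CUDA events instead of nvprof
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
std::cout << "kernel time: " << ms*1000.0f << " us" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);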
From an achieved-bandwidth perspective, the kernel moves 2*1024*1024*sizeof(int) bytes of data (reading and writing each of those bytes). That is 8MB read plus 8MB written, i.e. 16MB in 40us, for an achieved bandwidth of 400,000 MB/s or 400GB/s. This happens to be on a Tesla P100 GPU, which has roughly 500GB/s of available bandwidth as reported by bandwidthTest. So by this measure the kernel reaches about 80% of the available peak bandwidth.
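Spelling out that arithmetic, using the answer's rounded figure of 16MB of total traffic:

\[
\frac{16\ \mathrm{MB}}{40\ \mu\mathrm{s}} = \frac{16\times10^{6}\ \mathrm{B}}{40\times10^{-6}\ \mathrm{s}} = 4\times10^{11}\ \mathrm{B/s} = 400\ \mathrm{GB/s}
\]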
This slightly improved version of the kernel runs in about 34 microseconds instead of 40, moving the same 16MB in 34us, i.e. roughly 470GB/s:
$ cat t346.cu
#include <iostream>
#ifndef DIM
#define DIM 16
#endif
typedef int mt;
template <typename T>
__global__ void sxs(const T * __restrict__ s1, const T * __restrict__ s2, T * dest, const size_t width, const size_t height){
  size_t sidx = threadIdx.x+blockDim.x*blockIdx.x;
  while (sidx < width*height){
    size_t mydiv = sidx/width;       // source row
    size_t mytrunc = mydiv*width;    // row offset; added to sidx it doubles the row stride
    size_t didx = mytrunc + sidx;    // == 2*row*width + column: left half of the output row (image 1)
    size_t didx2 = didx + width;     // right half of the output row (image 2)
    dest[didx] = s1[sidx];
    dest[didx2] = s2[sidx];
    sidx += gridDim.x*blockDim.x;} // grid-stride loop
}
const size_t w = DIM;
const size_t h = DIM;
int main(){
  // data setup
  mt *h_d1, *h_d2, *h_o, *d_d1, *d_d2, *d_o;
  h_d1 = new mt[w*h];
  h_d2 = new mt[w*h];
  h_o = new mt[w*h*2];
  cudaMalloc(&d_d1, w*h*sizeof(mt));
  cudaMalloc(&d_d2, w*h*sizeof(mt));
  cudaMalloc(&d_o, 2*w*h*sizeof(mt));
  for (int i = 0; i < w*h; i++){
    h_d1[i] = 1;
    h_d2[i] = 2;}
  cudaMemcpy(d_d1, h_d1, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_d2, h_d2, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  // method 1: kernel
  sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "kernel:" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  // method 2: cudaMemcpy2D
  cudaMemcpy2D(d_o, 2*w*sizeof(mt), d_d1, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_o+w, 2*w*sizeof(mt), d_d2, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "cudaMemcpy2D" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  return 0;
}
$ nvcc -arch=sm_60 -o t346 t346.cu -DDIM=1024
$ nvprof ./t346
==6141== NVPROF is profiling process 6141, command: ./t346
==6141== Profiling application: ./t346
==6141== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 72.94% 5.1450ms 2 2.5725ms 1.9829ms 3.1622ms [CUDA memcpy DtoH]
26.07% 1.8388ms 2 919.42us 915.32us 923.51us [CUDA memcpy HtoD]
0.52% 36.352us 2 18.176us 18.048us 18.304us [CUDA memcpy DtoD]
0.48% 33.728us 1 33.728us 33.728us 33.728us void sxs<int>(int const *, int const *, int*, unsigned long, unsigned long)
API calls: 95.63% 353.56ms 3 117.85ms 277.75us 353.00ms cudaMalloc
2.49% 9.1907ms 4 2.2977ms 1.1484ms 4.2988ms cudaMemcpy
1.31% 4.8520ms 384 12.635us 382ns 523.01us cuDeviceGetAttribute
0.40% 1.4867ms 4 371.67us 240.82us 569.00us cuDeviceTotalMem
0.12% 449.25us 4 112.31us 99.344us 139.12us cuDeviceGetName
0.02% 79.583us 2 39.791us 17.312us 62.271us cudaMemcpy2D
0.02% 57.212us 1 57.212us 57.212us 57.212us cudaLaunchKernel
0.01% 24.571us 4 6.1420us 4.2080us 9.2350us cuDeviceGetPCIBusId
0.00% 9.7550us 8 1.2190us 480ns 2.8420us cuDeviceGet
0.00% 6.2190us 3 2.0730us 380ns 3.5220us cuDeviceGetCount
0.00% 2.3150us 4 578ns 515ns 720ns cuDeviceGetUuid
$
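Neither approach depends on the element type being int; for the uint8_t buffers in the original question the same cudaMemcpy2D technique applies directly, since the pitch and width arguments are given in bytes. A minimal sketch under that assumption (the pointer and size names are placeholders, not taken from the code above):

// hedged sketch: row-interleave two W x H uint8_t device images into one 2W x H buffer
#include <cstdint>
#include <cuda_runtime.h>

void place_side_by_side(const uint8_t *d_img1, const uint8_t *d_img2,
                        uint8_t *d_combined, size_t W, size_t H){
  // destination pitch is 2*W bytes: each output row holds one row of image 1
  // followed by the corresponding row of image 2
  cudaMemcpy2D(d_combined,     2*W, d_img1, W, W, H, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_combined + W, 2*W, d_img2, W, W, H, cudaMemcpyDeviceToDevice);
}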