I have two uint8_t image buffers that I want to place side by side in CUDA memory, like this:
--------------------- ---------------------
|                   | |                   |
|                   | |                   |
|         1         | |         2         |
|                   | |                   |
|                   | |                   |
--------------------- ---------------------
Using cudaMemcpy doesn't work, because it copies linearly and the second buffer ends up overwriting the first.
cudaArray_t seems to be geared more toward floating point, and there isn't much documentation on it either.
Any help would be appreciated! Thanks.
Answer 0 (score: 1)
Here is a comparison of 2 different approaches. The first uses a kernel to place the two separate buffers "side by side" in device memory, i.e. row-interleaved. The second uses two cudaMemcpy2D calls to do the same thing:
$ cat t346.cu
#include <iostream>
#ifndef DIM
#define DIM 16
#endif
typedef int mt;
template <typename T>
__global__ void sxs(const T * __restrict__ s1, const T * __restrict__ s2, T * dest, size_t width, size_t height){
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  size_t sidx = idx;
  while (sidx < width*height){
    size_t mydiv = sidx/width;                 // source row
    size_t mymod = sidx - (mydiv*width);       // source column
    size_t didx = ((mydiv) * 2)*width + mymod; // left half of the output row (image 1)
    size_t didx2 = didx + width;               // right half of the output row (image 2)
    dest[didx] = s1[sidx];
    dest[didx2] = s2[sidx];
    sidx += gridDim.x*blockDim.x;} // grid-stride loop
}
const size_t w = DIM;
const size_t h = DIM;
int main(){
  // data setup
  mt *h_d1, *h_d2, *h_o, *d_d1, *d_d2, *d_o;
  h_d1 = new mt[w*h];
  h_d2 = new mt[w*h];
  h_o = new mt[w*h*2];
  cudaMalloc(&d_d1, w*h*sizeof(mt));
  cudaMalloc(&d_d2, w*h*sizeof(mt));
  cudaMalloc(&d_o, 2*w*h*sizeof(mt));
  for (int i = 0; i < w*h; i++){
    h_d1[i] = 1;
    h_d2[i] = 2;}
  cudaMemcpy(d_d1, h_d1, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_d2, h_d2, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  // method 1: kernel
  sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "kernel:" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  // method 2: cudaMemcpy2D
  cudaMemcpy2D(d_o, 2*w*sizeof(mt), d_d1, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_o+w, 2*w*sizeof(mt), d_d2, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "cudaMemcpy2D" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  return 0;
}
$ nvcc -o t346 t346.cu
$ cuda-memcheck ./t346
========= CUDA-MEMCHECK
kernel:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
cudaMemcpy2D
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
========= ERROR SUMMARY: 0 errors
$ nvcc -o t346 t346.cu -DDIM=1024
$ nvprof ./t346
==7903== NVPROF is profiling process 7903, command: ./t346
==7903== Profiling application: ./t346
==7903== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 63.26% 5.6010ms 2 2.8005ms 2.0116ms 3.5894ms [CUDA memcpy DtoH]
35.88% 3.1773ms 2 1.5887ms 1.5809ms 1.5965ms [CUDA memcpy HtoD]
0.45% 39.679us 1 39.679us 39.679us 39.679us void sxs<int>(int const *, int const *, int*, unsigned long, unsigned long)
0.41% 36.224us 2 18.112us 18.080us 18.144us [CUDA memcpy DtoD]
API calls: 94.95% 356.93ms 3 118.98ms 290.75us 356.33ms cudaMalloc
2.96% 11.121ms 4 2.7802ms 2.0230ms 4.4443ms cudaMemcpy
1.49% 5.6179ms 384 14.629us 406ns 969.76us cuDeviceGetAttribute
0.43% 1.6087ms 4 402.18us 250.54us 615.60us cuDeviceTotalMem
0.12% 462.90us 4 115.73us 105.58us 134.35us cuDeviceGetName
0.02% 82.153us 2 41.076us 24.136us 58.017us cudaMemcpy2D
0.02% 60.048us 1 60.048us 60.048us 60.048us cudaLaunchKernel
0.01% 24.121us 4 6.0300us 4.1910us 8.5880us cuDeviceGetPCIBusId
0.00% 10.201us 8 1.2750us 534ns 2.7570us cuDeviceGet
0.00% 6.6820us 3 2.2270us 368ns 3.8570us cuDeviceGetCount
0.00% 2.8140us 4 703ns 583ns 844ns cuDeviceGetUuid
$
We can see that for the test case above, with 1024x1024 images, the kernel method takes about 40 microseconds, whereas the two cudaMemcpy2D operations together take about 80 microseconds.
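If nvprof is not available, a comparable kernel timing can be obtained with CUDA events. This is only a minimal sketch, not part of the test program above; it reuses its d_d1, d_d2, d_o, w and h:

// hedged sketch: time the sxs kernel with CUDA events instead of nvprof
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
cudaEventRecord(stop);
cudaEventSynchronize(stop);               // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
std::cout << "kernel time: " << ms*1000.0f << " us" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);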
From an achieved-bandwidth perspective, the kernel moves 2*1024*1024*sizeof(int) bytes of data (reading and writing each of those bytes). That is 8MB read plus 8MB written, i.e. 16MB in 40us, for an achieved bandwidth of 400,000 MB/s or 400GB/s. This happens to be on a Tesla P100 GPU, which has roughly 500GB/s of available bandwidth as reported by bandwidthTest. So by this measure the kernel reaches about 80% of the available peak bandwidth.
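Spelling out that arithmetic, using the answer's rounded figure of 16MB of total traffic:

\[
\frac{16\ \mathrm{MB}}{40\ \mu\mathrm{s}} = \frac{16\times10^{6}\ \mathrm{B}}{40\times10^{-6}\ \mathrm{s}} = 4\times10^{11}\ \mathrm{B/s} = 400\ \mathrm{GB/s}
\]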
This slightly improved version of the kernel runs in about 34 microseconds instead of 40, moving the same 16MB in 34us, i.e. roughly 470GB/s:
$ cat t346.cu
#include <iostream>
#ifndef DIM
#define DIM 16
#endif
typedef int mt;
template <typename T>
__global__ void sxs(const T * __restrict__ s1, const T * __restrict__ s2, T * dest, const size_t width, const size_t height){
  size_t sidx = threadIdx.x+blockDim.x*blockIdx.x;
  while (sidx < width*height){
    size_t mydiv = sidx/width;       // source row
    size_t mytrunc = mydiv*width;    // row offset; added to sidx it doubles the row stride
    size_t didx = mytrunc + sidx;    // == 2*row*width + column: left half of the output row (image 1)
    size_t didx2 = didx + width;     // right half of the output row (image 2)
    dest[didx] = s1[sidx];
    dest[didx2] = s2[sidx];
    sidx += gridDim.x*blockDim.x;} // grid-stride loop
}
const size_t w = DIM;
const size_t h = DIM;
int main(){
  // data setup
  mt *h_d1, *h_d2, *h_o, *d_d1, *d_d2, *d_o;
  h_d1 = new mt[w*h];
  h_d2 = new mt[w*h];
  h_o = new mt[w*h*2];
  cudaMalloc(&d_d1, w*h*sizeof(mt));
  cudaMalloc(&d_d2, w*h*sizeof(mt));
  cudaMalloc(&d_o, 2*w*h*sizeof(mt));
  for (int i = 0; i < w*h; i++){
    h_d1[i] = 1;
    h_d2[i] = 2;}
  cudaMemcpy(d_d1, h_d1, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_d2, h_d2, w*h*sizeof(mt), cudaMemcpyHostToDevice);
  // method 1: kernel
  sxs<<<(w*h+511)/512, 512>>>(d_d1, d_d2, d_o, w, h);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "kernel:" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  // method 2: cudaMemcpy2D
  cudaMemcpy2D(d_o, 2*w*sizeof(mt), d_d1, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_o+w, 2*w*sizeof(mt), d_d2, w*sizeof(mt), w*sizeof(mt), h, cudaMemcpyDeviceToDevice);
  cudaMemcpy(h_o, d_o, 2*w*h*sizeof(mt), cudaMemcpyDeviceToHost);
  if (w == 16){
    std::cout << "cudaMemcpy2D" << std::endl;
    int cnt = 0;
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 32; j++) std::cout << h_o[cnt++] << " ";
      std::cout << std::endl;}
  }
  return 0;
}
$ nvcc -arch=sm_60 -o t346 t346.cu -DDIM=1024
$ nvprof ./t346
==6141== NVPROF is profiling process 6141, command: ./t346
==6141== Profiling application: ./t346
==6141== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 72.94% 5.1450ms 2 2.5725ms 1.9829ms 3.1622ms [CUDA memcpy DtoH]
26.07% 1.8388ms 2 919.42us 915.32us 923.51us [CUDA memcpy HtoD]
0.52% 36.352us 2 18.176us 18.048us 18.304us [CUDA memcpy DtoD]
0.48% 33.728us 1 33.728us 33.728us 33.728us void sxs<int>(int const *, int const *, int*, unsigned long, unsigned long)
API calls: 95.63% 353.56ms 3 117.85ms 277.75us 353.00ms cudaMalloc
2.49% 9.1907ms 4 2.2977ms 1.1484ms 4.2988ms cudaMemcpy
1.31% 4.8520ms 384 12.635us 382ns 523.01us cuDeviceGetAttribute
0.40% 1.4867ms 4 371.67us 240.82us 569.00us cuDeviceTotalMem
0.12% 449.25us 4 112.31us 99.344us 139.12us cuDeviceGetName
0.02% 79.583us 2 39.791us 17.312us 62.271us cudaMemcpy2D
0.02% 57.212us 1 57.212us 57.212us 57.212us cudaLaunchKernel
0.01% 24.571us 4 6.1420us 4.2080us 9.2350us cuDeviceGetPCIBusId
0.00% 9.7550us 8 1.2190us 480ns 2.8420us cuDeviceGet
0.00% 6.2190us 3 2.0730us 380ns 3.5220us cuDeviceGetCount
0.00% 2.3150us 4 578ns 515ns 720ns cuDeviceGetUuid
$
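Neither approach depends on the element type being int; for the uint8_t buffers in the original question the same cudaMemcpy2D technique applies directly, since the pitch and width arguments are given in bytes. A minimal sketch under that assumption (the pointer and size names are placeholders, not taken from the code above):

// hedged sketch: row-interleave two W x H uint8_t device images into one 2W x H buffer
#include <cstdint>
#include <cuda_runtime.h>

void place_side_by_side(const uint8_t *d_img1, const uint8_t *d_img2,
                        uint8_t *d_combined, size_t W, size_t H){
  // destination pitch is 2*W bytes: each output row holds one row of image 1
  // followed by the corresponding row of image 2
  cudaMemcpy2D(d_combined,     2*W, d_img1, W, W, H, cudaMemcpyDeviceToDevice);
  cudaMemcpy2D(d_combined + W, 2*W, d_img2, W, W, H, cudaMemcpyDeviceToDevice);
}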