我有两个方阵:d_img 和 d_template。我正在尝试使用 cudaMemcpy2D() 将 d_img 的一块区域(本例中从左上角开始)复制到 d_template,但它复制的范围并不正确。我查到的类似问题大多是因为参数传错了,但我很确定我的参数是对的。我也检查过,没有报告任何错误。
// Side length, in elements, of the square template (the sub-region to extract).
const int TEMPLATE_DIM = 10;
// Width, in elements, of the border that surrounds the template inside the image.
const int OFFSET_DIM = 1;
// Side length, in elements, of the square image: template plus a border on both sides.
const int IMG_DIM = TEMPLATE_DIM + 2 * OFFSET_DIM; // 12
// Size in bytes of one tightly packed row of the template.
const size_t TEMPLATE_DIM_BYTES = sizeof(int) * TEMPLATE_DIM;
// Size in bytes of one tightly packed row of the image.
const size_t IMG_DIM_BYTES = sizeof(int) * IMG_DIM;
// Demo program: allocates a pitched IMG_DIM x IMG_DIM device image and a
// pitched TEMPLATE_DIM x TEMPLATE_DIM device template, fills the image with
// row-major indices, copies the image's top-left region into the template,
// and prints both matrices. None of the CUDA return codes are checked here.
int main(){
// Larger matrix: host array plus a pitched device allocation.
// cudaMallocPitch stores the device row stride (in bytes) in imgPitch.
int h_img[IMG_DIM][IMG_DIM];
int* d_img;
size_t imgPitch;
cudaMallocPitch(&d_img, &imgPitch, IMG_DIM_BYTES, IMG_DIM);
// Subset matrix: host array plus a pitched device allocation.
// cudaMallocPitch stores the device row stride (in bytes) in templatePitch.
int h_template[TEMPLATE_DIM][TEMPLATE_DIM];
int* d_template;
size_t templatePitch;
cudaMallocPitch(&d_template, &templatePitch, TEMPLATE_DIM_BYTES, TEMPLATE_DIM);
// Populate h_img so that element (y, x) holds its row-major index, then copy to d_img.
// The RNG is seeded here, but no rand() call is visible in this function.
srand(time(NULL)+1);
for (int y = 0; y < IMG_DIM; ++y)
for (int x = 0; x < IMG_DIM; ++x)
h_img[y][x] = y*IMG_DIM+x;
// printTemplateImg is a helper defined outside this snippet.
cout << "h_img: \n"; printTemplateImg(h_img);
// NOTE(review): d_img was allocated with cudaMallocPitch, but this flat
// cudaMemcpy writes IMG_DIM_BYTES*IMG_DIM contiguous bytes and never uses
// imgPitch. If imgPitch != IMG_DIM_BYTES, rows after the first will not start
// at the pitch-aligned offsets that the cudaMemcpy2D call below reads from.
cudaMemcpy(d_img, h_img, IMG_DIM_BYTES*IMG_DIM, cudaMemcpyHostToDevice);
// Copy the top-left TEMPLATE_DIM x TEMPLATE_DIM region of d_img into d_template,
// passing each allocation's own pitch as its row stride.
cudaMemcpy2D(d_template, templatePitch, d_img, imgPitch, TEMPLATE_DIM_BYTES, TEMPLATE_DIM, cudaMemcpyDeviceToDevice);
// Copy d_template back to h_template so it can be printed.
// NOTE(review): same concern as the upload above - this flat cudaMemcpy reads
// contiguous bytes and does not use templatePitch.
cudaMemcpy(h_template, d_template, TEMPLATE_DIM_BYTES*TEMPLATE_DIM, cudaMemcpyDeviceToHost);
// Wait for any outstanding device work before printing.
cudaDeviceSynchronize();
// printTemplate is a helper defined outside this snippet.
cout << "h_template: \n"; printTemplate(h_template);
}
这是输出
h_img:
{
{0,1,2,3,4,5,6,7,8,9,10,11,}
{12,13,14,15,16,17,18,19,20,21,22,23,}
{24,25,26,27,28,29,30,31,32,33,34,35,}
{36,37,38,39,40,41,42,43,44,45,46,47,}
{48,49,50,51,52,53,54,55,56,57,58,59,}
{60,61,62,63,64,65,66,67,68,69,70,71,}
{72,73,74,75,76,77,78,79,80,81,82,83,}
{84,85,86,87,88,89,90,91,92,93,94,95,}
{96,97,98,99,100,101,102,103,104,105,106,107,}
{108,109,110,111,112,113,114,115,116,117,118,119,}
{120,121,122,123,124,125,126,127,128,129,130,131,}
{132,133,134,135,136,137,138,139,140,141,142,143,}
}
h_template:
{
{0,1,2,3,4,5,6,7,8,9,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
{0,0,0,0,0,0,0,0,0,0,}
}
为什么只复制了第一行?另外,如果把 TEMPLATE_DIM 改为 32 或 96,会出现奇怪的跳行现象——特此说明,以防您能看出我没有注意到的规律。
答案 0 :(得分:2)
由于您选择的设备内存是用 cudaMallocPitch
分配的带行间距(pitched)的线性内存,因此与设备之间的传输需要使用 cudaMemcpy2D。
而您目前的代码使用的是 cudaMemcpy,
它没有考虑行间距(pitch),因此读写的是设备上这块内存中错误的区域。
如果您将操作顺序更改为以下内容:
// Populate h_img, then copy it to d_img.
// ...
// Host rows are tightly packed, so the source pitch is IMG_DIM_BYTES;
// the destination uses imgPitch, the row stride returned by cudaMallocPitch.
cudaMemcpy2D(d_img, imgPitch, h_img, IMG_DIM_BYTES, IMG_DIM_BYTES, IMG_DIM, cudaMemcpyHostToDevice);
// Copy the top-left TEMPLATE_DIM x TEMPLATE_DIM region of d_img to d_template;
// both sides are pitched device allocations, so each passes its own pitch.
cudaMemcpy2D(d_template, templatePitch, d_img, imgPitch, TEMPLATE_DIM_BYTES, TEMPLATE_DIM, cudaMemcpyDeviceToDevice);
// Copy d_template back to h_template to view it. The host destination is
// tightly packed (pitch TEMPLATE_DIM_BYTES); the device source uses templatePitch.
cudaMemcpy2D(h_template, TEMPLATE_DIM_BYTES, d_template, templatePitch, TEMPLATE_DIM_BYTES, TEMPLATE_DIM, cudaMemcpyDeviceToHost);
你应该发现代码按预期工作。