Question

这是我第一次在这里发帖。我目前正在开发一个项目，需要将一个很大的二维数组（大约 1,000,000x7）写入 GPU，进行一些计算，然后再将其传回主机。由于数组很大，而我又希望传输尽量快，我把数组展平成一维，以便简单地传给 GPU。数组写入成功了（至少在写入设备时 cudaMalloc 和 cudaMemcpy 都返回了 cudaSuccess），但是当我尝试把它读回来时，cudaMemcpy 返回了“invalid argument”（无效参数）错误。

我想不明白为什么会这样：我认为我已经在设备上写入了一个有效的（展平后的）一维数组，读回时提供的参数也是正确的。我在网上搜到的与这个错误相关的唯一结果，是 cudaMemcpy 的 dst 和 src 参数被写反了，但我认为我的参数顺序是正确的。

这是我的代码的简化版本，可以重现问题：

#include <iostream>

using namespace std;

// Allocates a rows x cols float matrix as one contiguous buffer plus a table
// of row pointers.
//   arr  - (out, by reference) receives the row-pointer table; arr[0] is the
//          start of the contiguous rows*cols buffer.
//   rows - number of row pointers to allocate.
//   cols - number of floats per row.
// Nothing is freed here.
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){ 

    // One pointer per row.
    arr = new float*[rows];

    // Single contiguous block holding every element of the matrix.
    arr[0] = new float[rows * cols];

    // Each following row starts cols floats after the previous one.
    for(unsigned long int i = 1; i < rows; i++) arr[i] = arr[i - 1] + cols;
}

// Allocates rows*cols floats on the device and copies the flattened host
// matrix (starting at arr[0]) into that buffer.
//   arr    - host row-pointer table; arr[0] is used as the source address.
//   devPtr - device pointer variable handed to cudaMalloc.
//   rows, cols - matrix dimensions, used to compute the byte count.
// NOTE(review): devPtr is a by-value parameter, so the address cudaMalloc
// stores through &devPtr lands in this function's local copy; the caller's
// pointer variable is not modified.
// Failures are reported on cerr; the copy is still attempted after a failed
// cudaMalloc.
void write2dArrayToGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){

    // Writes the new device address into the local devPtr.
    if(cudaSuccess != cudaMalloc((void**)&devPtr, sizeof(float) * rows * cols)) cerr << "cudaMalloc Failed";

    // Host -> device copy of the whole contiguous buffer.
    if(cudaSuccess != cudaMemcpy(devPtr, arr[0], sizeof(float) * rows * cols, cudaMemcpyHostToDevice)) cerr << "cudaMemcpy Write Failed";
}

// Copies rows*cols floats from the device address devPtr into the host
// buffer starting at arr[0].
//   arr    - host row-pointer table; arr[0] is used as the destination.
//   devPtr - device address to read from (passed by value).
//   rows, cols - matrix dimensions, used to compute the byte count.
// A failed copy is reported on cerr.
void read2dArrayFromGPU(float ** arr, float * devPtr, unsigned long int rows, unsigned long int cols){

    // Device -> host copy of the whole contiguous buffer.
    if(cudaSuccess != cudaMemcpy(arr[0], devPtr, sizeof(float) * rows * cols, cudaMemcpyDeviceToHost)) cerr << "cudaMemcpy Read Failed" << endl;
}

// Fills an R x C host matrix, uploads it with write2dArrayToGPU, then tries
// to download it into a second host matrix with read2dArrayFromGPU.
int main(){

int R = 100;
int C = 7;

cout << "Allocating an " << R << "x" << C << " array ...";
float ** arrA;
alloc2dArray(arrA, R, C);


cout << "Assigning some values ...";
// Element (i, j) gets its own flattened index i*C + j.
for(int i = 0; i < R; i++){
    for(int j = 0; j < C; j++){
        arrA[i][j] = i*C + j;
    }
}
cout << "Done!" << endl;


cout << "Writing to the GPU ...";
float * Darr = 0;
// NOTE(review): Darr is passed by value here, so it is still 0 after this
// call returns; the device address stays inside write2dArrayToGPU.
write2dArrayToGPU(arrA, Darr, R, C);
cout << " Done!" << endl;

cout << "Allocating second " << R << "x" << C << " array ...";
float ** arrB;
alloc2dArray(arrB, R, C);
cout << "Done!" << endl;

cout << "Reading from the GPU into the new array ...";
// Darr (still 0, see note above) is used as the device source address.
read2dArrayFromGPU(arrB, Darr, R, C);


}

我在笔记本电脑上使用以下命令编译并运行它：

 $nvcc -arch=sm_30 test.cu -o test
 $optirun cuda-memcheck ./test

并得到结果：

========= CUDA-MEMCHECK
Allocating an 100x7 array ...Assigning some values ...Done!
Writing to the GPU ... Done!
Allocating second 100x7 array ...Done!
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
Reading from the GPU into the new array ...=========     Host Frame:/usr/lib64/nvidia-bumblebee/libcuda.so.1 [0x2ef343]
cudaMemcpy Read Failed=========     Host Frame:./test [0x38c6f]
=========     Host Frame:./test [0x2f08]
=========     Host Frame:./test [0x3135]
=========     Host Frame:/usr/lib64/libc.so.6 (__libc_start_main + 0xf1) [0x20401]
=========     Host Frame:./test [0x2c6a]

=========
========= ERROR SUMMARY: 1 error

我是 CUDA 新手，还在学习中，非常感谢任何帮助，谢谢！

Answer 1

感谢 Robert Crovella 在上面的评论中为我指明了正确的方向，并链接了一个类似的问题。

要点是：由于 devPtr 是按值传递（而不是通过指针或引用传递）到我的 GPU 写入和读取函数中的，cudaMalloc 和 cudaMemcpy 只作用于函数作用域内的那份副本；main 中的 Darr 并没有被修改，仍然是 0，因此读取时 cudaMemcpy 收到的是一个无效的设备指针。

有两种解决方案（这两种方案在我这里运行都没有报错）：

第一种：将 devPtr 通过引用传递给 write2dArrayToGPU 和 read2dArrayFromGPU，解决方案如下：

#include <iostream>

using namespace std;


// Builds a rows x cols float matrix backed by a single contiguous buffer.
//   arr  - (out, by reference) receives the table of row pointers; arr[0] is
//          the start of the contiguous rows*cols buffer and arr[r] points at
//          the first element of row r.
//   rows - number of row pointers to allocate.
//   cols - number of floats per row.
// Neither allocation is released here.
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){
    arr = new float*[rows];

    // All elements live in one block so the matrix can be copied in one go.
    float * const base = new float[rows * cols];
    arr[0] = base;

    // Row r begins r*cols floats into the block.
    for(unsigned long int r = 1; r < rows; ++r){
        arr[r] = base + r * cols;
    }
}

//changed float * devPtr to float *  &devPtr
void write2dArrayToGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){

    if(cudaSuccess != cudaMalloc((void**)&devPtr, sizeof(float) * rows * cols)) cerr << "cudaMalloc Failed";

    if(cudaSuccess != cudaMemcpy(devPtr, arr[0], sizeof(float) * rows * cols, cudaMemcpyHostToDevice)) cerr << "cudaMemcpy Write Failed";
}

//changed float * devPtr to float * &devPtr
void read2dArrayFromGPU(float ** arr, float * &devPtr, unsigned long int rows, unsigned long int cols){

    if(cudaSuccess != cudaMemcpy(arr[0], devPtr, sizeof(float) * rows * cols, cudaMemcpyDeviceToHost)) cerr << "cudaMemcpy Read Failed" << endl;
}

int main(){

int R = 100;
int C = 7;

cout << "Allocating an " << R << "x" << C << " array ...";
float ** arrA;
alloc2dArray(arrA, R, C);


cout << "Assigning some values ...";
for(int i = 0; i < R; i++){
    for(int j = 0; j < C; j++){
        arrA[i][j] = i*C + j;
    }
}
cout << "Done!" << endl;


cout << "Writing to the GPU ...";
float * Darr = 0;
write2dArrayToGPU(arrA, Darr, R, C);
cout << " Done!" << endl;

cout << "Allocating second " << R << "x" << C << " array ...";
float ** arrB;
alloc2dArray(arrB, R, C);
cout << "Done!" << endl;

cout << "Reading from the GPU into the new array ...";
read2dArrayFromGPU(arrB, Darr, R, C);


}

第二种：通过指针传递 devPtr，解决方案如下：

#include <iostream>

using namespace std;

// Builds a rows x cols float matrix backed by a single contiguous buffer.
//   arr  - (out, by reference) receives the table of row pointers; arr[0] is
//          the start of the contiguous rows*cols buffer and arr[r] points at
//          the first element of row r.
//   rows - number of row pointers to allocate.
//   cols - number of floats per row.
// Neither allocation is released here.
void alloc2dArray(float ** &arr, unsigned long int rows, unsigned long int cols){
    arr = new float*[rows];

    // All elements live in one block so the matrix can be copied in one go.
    float * const base = new float[rows * cols];
    arr[0] = base;

    // Row r begins r*cols floats into the block.
    for(unsigned long int r = 1; r < rows; ++r){
        arr[r] = base + r * cols;
    }
}

//changed float * devPtr to float ** devPtr
void write2dArrayToGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){

    if(cudaSuccess != cudaMalloc((void**)devPtr, sizeof(float) * rows * cols)) cerr << "cudaMalloc Failed";

    if(cudaSuccess != cudaMemcpy(*devPtr, arr[0], sizeof(float) * rows * cols, cudaMemcpyHostToDevice)) cerr << "cudaMemcpy Write Failed";
}

//changed float * devPtr to float ** devPtr
void read2dArrayFromGPU(float ** arr, float ** devPtr, unsigned long int rows, unsigned long int cols){

    if(cudaSuccess != cudaMemcpy(arr[0], *devPtr, sizeof(float) * rows * cols, cudaMemcpyDeviceToHost)) cerr << "cudaMemcpy Read Failed" << endl;
}

int main(){

int R = 100;
int C = 7;

cout << "Allocating an " << R << "x" << C << " array ...";
float ** arrA;
alloc2dArray(arrA, R, C);


cout << "Assigning some values ...";
for(int i = 0; i < R; i++){
    for(int j = 0; j < C; j++){
        arrA[i][j] = i*C + j;
    }
}
cout << "Done!" << endl;


cout << "Writing to the GPU ...";
float * Darr = 0;
write2dArrayToGPU(arrA, &Darr, R, C); \\changed Darr to &Darr
cout << " Done!" << endl;

cout << "Allocating second " << R << "x" << C << " array ...";
float ** arrB;
alloc2dArray(arrB, R, C);
cout << "Done!" << endl;

cout << "Reading from the GPU into the new array ...";
read2dArrayFromGPU(arrB, &Darr, R, C); // changed Darr to &Darr


}

cudaMemcpy在从Device to Host读取时返回cudaErrorInvalidArgument，不清楚原因

1 个答案: