我正在测试一段代码,其中的内核旨在对存储在两个指针中的两个值执行简单求和,并把结果写入第三个指针。
在调用内核“add”之后,即使内核没有对这些指针做任何操作,我也无法再把数据从主机复制到设备、再从设备复制回主机。但是当我注释掉调用内核的那条语句时,就能得到正确的结果。这是代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Single-thread kernel: writes the difference of the two input values into *c.
// Note: despite the name "add", this computes a subtraction (a - b).
// All three pointers must refer to device memory.
__global__ void add(int *a, int *b, int *c)
{
    c[0] = a[0] - b[0];
}
int main(void)
{
int result, x_val, y_val; // Host copies of values read back from the device.
int *x_host, *y_host; // Pointers into HOST memory (allocated with malloc).
int *tempGPU, *x_dev, *y_dev; // Pointers into DEVICE memory (allocated with cudaMalloc).
x_host = (int *)malloc(sizeof(int));
y_host = (int *)malloc(sizeof(int));
*x_host = 8;
*y_host = 4;
// Sentinel values: if the later cudaMemcpy calls fail, these stay at -5/-10.
x_val = -5;
y_val = -10;
printf("\n x = %d, y = %d\n", *x_host, *y_host);
// NOTE(review): none of the CUDA calls below check their return codes, which
// is why the failure mode here is silent.
cudaMalloc( (void **)&tempGPU, sizeof(int) );
// BUG (the subject of the question): x_host and y_host are HOST pointers, but
// the kernel dereferences its arguments in DEVICE code. The launch therefore
// faults with an illegal memory access.
add<<<1,1>>> (x_host, y_host, tempGPU);
// Presumably the fault above poisons the CUDA context, so this copy (and the
// ones below) return errors and leave their destinations untouched — which is
// why "result" prints garbage and x_val/y_val keep their sentinel values.
cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host - y_host = %d\n", result);
cudaMalloc( (void **)&x_dev, sizeof(int) );
cudaMalloc( (void **)&y_dev, sizeof(int) );
*x_host = 6;
*y_host = 20;
// Round-trip: host -> device -> host. With the kernel launch commented out,
// these succeed and x_val/y_val come back as 6 and 20.
cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&x_val, x_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&y_val, y_dev, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host = %d, y_host = %d\n", *x_host, *y_host);
printf("\n x_val = %d, y_val = %d\n", x_val, y_val);
cudaFree( tempGPU );
// Only one error check, at the very end — too late to tell which call failed.
printf( "\nCUDA: %s\n", cudaGetErrorString(cudaGetLastError()) );
return 0;
}
我知道该函数期望的是在设备上分配的指针,但为什么这样一个错误会让我之后无法正确使用 cudaMemcpy?为什么当我注释掉这一行:
add<<<1,1>>> (x_host, y_host, tempGPU);
我得到了正确的结果。感谢。
答案 0(得分:2)
您的问题是 x_host 和 y_host 是指向主机内存空间的指针,而 __global__ 函数 add 需要的是指向设备内存空间的指针。按照你现在的写法,add 会错误地把 x_host 和 y_host 当作设备内存指针来解释。
正如Farzad所注意到的那样,你可以通过What is the canonical way to check for errors using the CUDA runtime API?意义上的适当CUDA错误检查来发现错误。
以下是使用正确的CUDA错误检查修复的代码。
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Checks a CUDA runtime status code; on failure prints the error string and
// the source location to stderr, then (when 'abort' is true, the default)
// terminates the process with the error code as exit status.
// Fix: 'file' is now const char* — __FILE__ expands to a string literal, and
// binding a literal to plain char* is ill-formed in modern C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
        {
            // Fix: the original called exit() BEFORE getchar(), so the pause
            // (meant to keep a console window open) was unreachable dead code.
            getchar();
            exit(code);
        }
    }
}
// Single-thread kernel: stores (*a - *b) into *c.
// All three arguments must be pointers into device memory.
__global__ void add(int *a, int *b, int *c)
{
    int lhs = *a;
    int rhs = *b;
    *c = lhs - rhs;
}
// Demonstrates the correct pattern: copy operands host -> device, launch the
// kernel with DEVICE pointers only, then copy the result back. Every CUDA
// call is wrapped in gpuErrchk so failures are reported at the failing call.
int main(void)
{
    // Host-side operands.
    int* x_host = (int*)malloc(sizeof(int));
    int* y_host = (int*)malloc(sizeof(int));
    *x_host = 8;
    *y_host = 4;
    // Device-side buffers: the kernel must only ever see these.
    int* tempGPU; gpuErrchk(cudaMalloc((void**)&tempGPU,sizeof(int)));
    int* x_dev;   gpuErrchk(cudaMalloc((void**)&x_dev, sizeof(int)));
    int* y_dev;   gpuErrchk(cudaMalloc((void**)&y_dev, sizeof(int)));
    gpuErrchk(cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice));
    int result;
    add<<<1,1>>> (x_dev, y_dev, tempGPU);
    gpuErrchk(cudaPeekAtLastError());    // catches launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());  // catches asynchronous execution errors
    gpuErrchk(cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost));
    printf("\n x_host - y_host = %d\n", result);
    gpuErrchk(cudaFree(x_dev));
    gpuErrchk(cudaFree(y_dev));
    gpuErrchk(cudaFree(tempGPU));
    // Fix: the original leaked both host allocations.
    free(x_host);
    free(y_host);
    getchar();  // keep the console window open
    return 0;
}