我正在测试一段代码,其中的内核旨在对存储在两个指针中的两个值执行简单求和,并把结果写入第三个指针。
在调用内核“add”之后,即使内核没有对这些指针做任何操作,我也无法再把数据从主机复制到设备、再从设备复制回主机。但是当我注释掉调用内核的那条语句时,就能得到正确的结果。这是代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Single-thread kernel: writes the difference of the two input values into *c.
// Note: despite the name "add", this computes a subtraction (a - b).
// All three pointers must refer to device memory.
__global__ void add(int *a, int *b, int *c)
{
    c[0] = a[0] - b[0];
}
int main(void)
{
int result, x_val, y_val; // Host copies of values read back from the device.
int *x_host, *y_host; // Pointers into HOST memory (allocated with malloc).
int *tempGPU, *x_dev, *y_dev; // Pointers into DEVICE memory (allocated with cudaMalloc).
x_host = (int *)malloc(sizeof(int));
y_host = (int *)malloc(sizeof(int));
*x_host = 8;
*y_host = 4;
// Sentinel values: if the later cudaMemcpy calls fail, these stay at -5/-10.
x_val = -5;
y_val = -10;
printf("\n x = %d, y = %d\n", *x_host, *y_host);
// NOTE(review): none of the CUDA calls below check their return codes, which
// is why the failure mode here is silent.
cudaMalloc( (void **)&tempGPU, sizeof(int) );
// BUG (the subject of the question): x_host and y_host are HOST pointers, but
// the kernel dereferences its arguments in DEVICE code. The launch therefore
// faults with an illegal memory access.
add<<<1,1>>> (x_host, y_host, tempGPU);
// Presumably the fault above poisons the CUDA context, so this copy (and the
// ones below) return errors and leave their destinations untouched — which is
// why "result" prints garbage and x_val/y_val keep their sentinel values.
cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host - y_host = %d\n", result);
cudaMalloc( (void **)&x_dev, sizeof(int) );
cudaMalloc( (void **)&y_dev, sizeof(int) );
*x_host = 6;
*y_host = 20;
// Round-trip: host -> device -> host. With the kernel launch commented out,
// these succeed and x_val/y_val come back as 6 and 20.
cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&x_val, x_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&y_val, y_dev, sizeof(int), cudaMemcpyDeviceToHost);
printf("\n x_host = %d, y_host = %d\n", *x_host, *y_host);
printf("\n x_val = %d, y_val = %d\n", x_val, y_val);
cudaFree( tempGPU );
// Only one error check, at the very end — too late to tell which call failed.
printf( "\nCUDA: %s\n", cudaGetErrorString(cudaGetLastError()) );
return 0;
}
我知道该函数期望的是在设备上分配的指针,但为什么这样一个错误会让我之后无法正确使用 cudaMemcpy?为什么当我注释掉这一行:
add<<<1,1>>> (x_host, y_host, tempGPU);
我得到了正确的结果。感谢。
答案 0(得分:2)
您的问题是 x_host 和 y_host 是指向主机内存空间的指针,而 __global__ 函数 add 需要的是指向设备内存空间的指针。按照你现在的写法,add 会错误地把 x_host 和 y_host 当作设备内存指针来解释。
正如Farzad所注意到的那样,你可以通过What is the canonical way to check for errors using the CUDA runtime API?意义上的适当CUDA错误检查来发现错误。
以下是使用正确的CUDA错误检查修复的代码。
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Checks a CUDA runtime status code; on failure prints the error string and
// the source location to stderr, then (when 'abort' is true, the default)
// terminates the process with the error code as exit status.
// Fix: 'file' is now const char* — __FILE__ expands to a string literal, and
// binding a literal to plain char* is ill-formed in modern C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
        {
            // Fix: the original called exit() BEFORE getchar(), so the pause
            // (meant to keep a console window open) was unreachable dead code.
            getchar();
            exit(code);
        }
    }
}
// Single-thread kernel: stores (*a - *b) into *c.
// All three arguments must be pointers into device memory.
__global__ void add(int *a, int *b, int *c)
{
    int lhs = *a;
    int rhs = *b;
    *c = lhs - rhs;
}
// Demonstrates the correct pattern: copy operands host -> device, launch the
// kernel with DEVICE pointers only, then copy the result back. Every CUDA
// call is wrapped in gpuErrchk so failures are reported at the failing call.
int main(void)
{
    // Host-side operands.
    int* x_host = (int*)malloc(sizeof(int));
    int* y_host = (int*)malloc(sizeof(int));
    *x_host = 8;
    *y_host = 4;
    // Device-side buffers: the kernel must only ever see these.
    int* tempGPU; gpuErrchk(cudaMalloc((void**)&tempGPU,sizeof(int)));
    int* x_dev;   gpuErrchk(cudaMalloc((void**)&x_dev, sizeof(int)));
    int* y_dev;   gpuErrchk(cudaMalloc((void**)&y_dev, sizeof(int)));
    gpuErrchk(cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice));
    int result;
    add<<<1,1>>> (x_dev, y_dev, tempGPU);
    gpuErrchk(cudaPeekAtLastError());    // catches launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());  // catches asynchronous execution errors
    gpuErrchk(cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost));
    printf("\n x_host - y_host = %d\n", result);
    gpuErrchk(cudaFree(x_dev));
    gpuErrchk(cudaFree(y_dev));
    gpuErrchk(cudaFree(tempGPU));
    // Fix: the original leaked both host allocations.
    free(x_host);
    free(y_host);
    getchar();  // keep the console window open
    return 0;
}