我是CUDA并行编程的初学者。我尝试了一个使用CUDA流添加矢量的程序。当我编译时,我得到以下错误。
解决方案不正确。 解决方案与第0行的预期结果不匹配。期望(1 + 0.5 = 1.5)但得到0。
我在网上通过示例书和类似问题与cuda核实过。找不到解决方案。任何人都可以帮我解决这个错误吗?提前谢谢。
#include <wb.h>
#define wbCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)
__global__ void vecAdd(float * in1, float * in2, float * out, int len)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < len)
out[i] = in1[i] + in2[i];
}
int main(int argc, char ** argv)
{
cudaStream_t stream0, stream1,stream2,stream3;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
wbArg_t args;
int inputLength;
float *h_A, *h_B, *h_C;
float *d_A0, *d_B0, *d_C0;
float *d_A1, *d_B1, *d_C1;
float *d_A2, *d_B2, *d_C2;
float *d_A3, *d_B3, *d_C3;
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
h_C = (float *) malloc(inputLength * sizeof(float));
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The input length is ", inputLength);
wbLog(TRACE, "h_A ", *h_A);
wbLog(TRACE, "h_B", *h_B);
int size = inputLength * sizeof(float);
int SegSize = inputLength/4;
wbCheck(cudaMalloc((void **) &d_A0, size));
wbCheck(cudaMalloc((void **) &d_B0, size));
wbCheck(cudaMalloc((void **) &d_C0, size));
wbCheck(cudaMalloc((void **) &d_A1, size));
wbCheck(cudaMalloc((void **) &d_B1, size));
wbCheck(cudaMalloc((void **) &d_C1, size));
wbCheck(cudaMalloc((void **) &d_A2, size));
wbCheck(cudaMalloc((void **) &d_B2, size));
wbCheck(cudaMalloc((void **) &d_C2, size));
wbCheck(cudaMalloc((void **) &d_A3, size));
wbCheck(cudaMalloc((void **) &d_B3, size));
wbCheck(cudaMalloc((void **) &d_C3, size));
cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault);
dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1);
dim3 DimBlock(256 , 1, 1);
for (int i=0; i<size; i+=inputLength*4)
{
cudaMemcpyAsync(d_A0, h_A+i, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_A1, h_A+i+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice,stream1);
cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice,stream1);
cudaMemcpyAsync(d_A2, h_A+i+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream2);
cudaMemcpyAsync(d_B2, h_B+i+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream2);
cudaMemcpyAsync(d_A3, h_A+i+SegSize+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream3);
cudaMemcpyAsync(d_B3, h_B+i+SegSize+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream3);
vecAdd<<<DimGrid, DimBlock, 0, stream0>>>(d_A0, d_B0, d_C0,inputLength);
vecAdd<<<DimGrid, DimBlock, 0, stream1>>>(d_A1, d_B1, d_C1,inputLength);
vecAdd<<<DimGrid, DimBlock, 0, stream2>>>(d_A2, d_B2, d_C2,inputLength);
vecAdd<<<DimGrid, DimBlock, 0, stream3>>>(d_A3, d_B3, d_C3,inputLength);
cudaDeviceSynchronize();
cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream1);
cudaMemcpyAsync(h_C+i+SegSize+SegSize, d_C2, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream2);
cudaMemcpyAsync(h_C+i+SegSize+SegSize+SegSize, d_C3, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream3);
wbLog(TRACE, "on addition is ", *h_C);
}
cudaFree(d_A0);
cudaFree(d_B0);
cudaFree(d_C0);
cudaFree(d_A1);
cudaFree(d_B1);
cudaFree(d_C1);
cudaFree(d_A2);
cudaFree(d_B2);
cudaFree(d_C2);
cudaFree(d_A3);
cudaFree(d_B3);
cudaFree(d_C3);
wbSolution(args, h_C, inputLength);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
return 0;
}
答案 0 :(得分:1)
一个问题是您如何处理h_A
,h_B
和h_C
:
h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
以上代码行正在为h_A
和h_B
创建分配并导入一些数据(可能是)。
这些代码行:
cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault);
不做你的想法。他们正在为h_A
,h_B
和h_C
创建新分配。无论以前引用的那些指针是什么数据都不再可以从这些指针访问(即,对于所有意图和目的,它都会丢失)。
CUDA应该可以正常使用这里创建的指针和分配:
h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
h_C = (float *) malloc(inputLength * sizeof(float));
所以删除这些代码行:
cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault);
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault);
并删除这些:
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
你应该更接近解决方案。