Question

我目前在CUDA编程方面遇到了一个具体的困难——更确切地说，是把数组从设备复制回主机并读取它。当我尝试读取本应返回给我的数据时，得到的全是垃圾数据。有人能看看我的代码片段并告诉我哪里做错了吗？非常感谢！

// Element type of the framebuffer arrays (host_fb / device_fb):
// two unsigned 32-bit words per element.
struct intss {
    u_int32_t one, two;
};



// Host entry point (excerpt: the "...." lines mark code elided by the author).
// Allocates an intss buffer on the host and on the device, launches render2,
// copies the device buffer back to the host and prints the `one` field of the
// first three elements.
int main()
{
    int block_size = 3;             
    int grid_size = 1;

    intss *device_fb = 0;
    intss *host_fb = 0;


    // Buffer holds block_size * grid_size elements of type intss.
    int num_bytes_fb = (block_size*grid_size)*sizeof(intss);


host_fb = (intss*)malloc(num_bytes_fb); 
// NOTE(review): the status codes returned by cudaMalloc and cudaMemcpy in this
// function are not checked, so a failing call would go unnoticed here.
cudaMalloc((void **)&device_fb, num_bytes_fb);

    ....

    // NOTE(review): in <<<a, b>>> the first value is the grid size (number of
    // blocks) and the second is the block size (threads per block). As written
    // this launches block_size (3) blocks of grid_size (1) thread each.
    render2<<<block_size,grid_size>>>(device_fb, device_pixelspercore, samples, obj_list_flat_dev, numOpsPerCore, lnumdev, camdev, lightsdev, uranddev, iranddev);


    ....

   // Copies num_bytes_fb bytes from device_fb into host_fb.
   cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost);


   // NOTE(review): `one` is a u_int32_t but is printed with "%d".
   printf("output %d ", host_fb[0].one);

   printf("output %d ", host_fb[1].one);

   printf("output %d ", host_fb[2].one);   
   //Note that I'm only looking at elements the 3 elements 0-2 from host_fb. I am   doing this because block_size*grid_size = 3. Is this wrong?

    cudaFree(device_fb);
    free(host_fb);
}


// Kernel (excerpt: the "...." line marks code elided by the author).
// Each thread derives a flat index from its block and thread coordinates and
// stores one packed value in device_fb[index].one.
// NOTE(review): index is not bounds-checked, so the launch must not create more
// threads than device_fb has elements - confirm against the caller.
// NOTE(review): device_fb[index].two is not written in the visible part of the kernel.
__global__ void render2(intss *device_fb, struct parallelPixels *pixelsPerCore, int     samples, double *obj_list_flat_dev, int numOpsPerCore, int lnumdev, struct camera camdev, struct vec3 *lightsdev, struct vec3 *uranddev, int *iranddev)            //SPECIFY ARGUMENTS!!!
{
int index = blockIdx.x * blockDim.x + threadIdx.x; //DETERMINING INDEX BASED ON WHICH THREAD IS CURRENTLY RUNNING

....

//computing data...


// Pack r, g, b into one 32-bit word: each channel goes through MIN(..., 1.0),
// is scaled by 255, masked to its low 8 bits and shifted by RSHIFT / GSHIFT /
// BSHIFT respectively (r, g, b and the shift constants are defined outside this excerpt).
device_fb[index].one = (((u_int32_t)(MIN(r, 1.0) * 255.0) & 0xff) << RSHIFT |   
                  ((u_int32_t)(MIN(g, 1.0) * 255.0) & 0xff) << GSHIFT |
                  ((u_int32_t)(MIN(b, 1.0) * 255.0) & 0xff) << BSHIFT);
}

编辑：

感谢您的建议，我在我的程序中实现了cudaErrorCheck函数，报错的函数调用似乎有一个共同的模式。

在我的程序中，我有一堆全局主机数组（obj_list，lights，urand，irand）。每当我尝试使用cudaMemcpy将这些主机数组复制到设备数组时，都会收到以下错误："Cuda error in file 'cudatrace.cu' in line x : invalid argument."

obj_list和lights在以下函数load_scene()中填充：

/*
 * Reads a scene description from fp, one record per line, and fills the
 * file-level scene state: the lights array (with its counter lnum), the
 * camera cam, and the sphere list rooted at obj_list (objCounter counts the
 * spheres that were added).
 *
 * Record layout, tokens separated by DELIM:
 *   <type> <x> <y> <z> <rad> <c0> <c1> <c2> <spow> <refl>
 *   'l' (light)  : only the position is used
 *   'c' (camera) : position -> cam.pos, <rad> -> cam.fov, <c0..c2> -> cam.targ
 *   's' (sphere) : all fields are used
 * Empty lines and lines whose first non-blank character is '#' are skipped.
 * Any other record type is reported on stderr.
 */
void load_scene(FILE *fp) {
    char line[256], *ptr, type;

    /* obj_list is a head node: its own sphere fields are never filled in
     * here, real spheres are linked in behind obj_list->next. */
    obj_list = (sphere *)malloc(sizeof(struct sphere));
    if (!obj_list) {
        fprintf(stderr, "load_scene: out of memory\n");
        return;
    }
    obj_list->next = 0;
    objCounter = 0;

    while ((ptr = fgets(line, sizeof line, fp))) {
        int i;
        struct vec3 pos, col;
        double rad, spow, refl;

        /* Skip leading blanks, then ignore comment and empty lines. */
        while (*ptr == ' ' || *ptr == '\t') ptr++;
        if (*ptr == '#' || *ptr == '\n') continue;

        if (!(ptr = strtok(line, DELIM))) continue;
        type = *ptr;

        /* Position: writes three consecutive doubles starting at pos.x
         * (assumes vec3 is laid out as consecutive doubles - TODO confirm).
         * If the line runs out of tokens, the remaining components are left
         * as they were. */
        for (i = 0; i < 3; i++) {
            if (!(ptr = strtok(0, DELIM))) break;
            *((double *)&pos.x + i) = atof(ptr);
        }

        if (type == 'l') {
            /* Do not write past the end of the lights table (assumed to hold
             * MAX_LIGHTS entries, matching the size used for the device copy). */
            if (lnum < MAX_LIGHTS) {
                lights[lnum++] = pos;
            } else {
                fprintf(stderr, "too many lights, ignoring extra light (max %d)\n",
                        (int)MAX_LIGHTS);
            }
            continue;
        }

        if (!(ptr = strtok(0, DELIM))) continue;
        rad = atof(ptr);

        /* Colour (or camera target), read the same way as the position. */
        for (i = 0; i < 3; i++) {
            if (!(ptr = strtok(0, DELIM))) break;
            *((double *)&col.x + i) = atof(ptr);
        }

        if (type == 'c') {
            cam.pos = pos;
            cam.targ = col;
            cam.fov = rad;
            continue;
        }

        if (!(ptr = strtok(0, DELIM))) continue;
        spow = atof(ptr);

        if (!(ptr = strtok(0, DELIM))) continue;
        refl = atof(ptr);

        if (type == 's') {
            struct sphere *sph = (sphere *)malloc(sizeof(*sph));
            if (!sph) {
                fprintf(stderr, "load_scene: out of memory\n");
                return;
            }
            objCounter++;

            /* Insert the new sphere directly behind the head node. */
            sph->next = obj_list->next;
            obj_list->next = sph;

            sph->pos = pos;
            sph->rad = rad;
            sph->mat.col = col;
            sph->mat.spow = spow;
            sph->mat.refl = refl;
        } else {
            fprintf(stderr, "unknown type: %c\n", type);
        }
    }
}

urand和irand填写如下：

/* initialize the random number tables for the jitter */
/* The three loops consume rand() in sequence (all x values, then all y values,
 * then all irand values); merging them would change which rand() result lands
 * in which slot. */
/* urand[i].x and urand[i].y each receive a value in [-0.5, 0.5]. */
for(i=0; i<NRAN; i++) urand[i].x = (double)rand() / RAND_MAX - 0.5;
for(i=0; i<NRAN; i++) urand[i].y = (double)rand() / RAND_MAX - 0.5;
/* irand[i] receives an integer in [0, NRAN]; the value NRAN itself occurs only
 * when rand() returns RAND_MAX.
 * NOTE(review): if irand values are used to index an NRAN-sized table, NRAN
 * would be one past the end - confirm against the code that reads irand. */
for(i=0; i<NRAN; i++) irand[i] = (int)(NRAN * ((double)rand() / RAND_MAX));

我不认为无效参数可能是由设备数组引起的，因为在cudaMemcpy调用之前创建设备数组的cudaMalloc调用没有CudaError消息。例如，在以下代码行中：

// Allocates device storage for MAX_LIGHTS vec3 elements; cudaMalloc stores the
// resulting device address in lightsdev.
cudaErrorCheck(cudaMalloc((void **)&lightsdev, MAX_LIGHTS*sizeof(struct vec3)) );

// NOTE(review): the destination passed here is &lightsdev, i.e. the address of
// the host-side pointer variable, not the device address that cudaMalloc stored
// in it. For a cudaMemcpyHostToDevice copy the destination should be the device
// pointer itself (lightsdev); this is the likely source of the reported
// "invalid argument" error.
cudaErrorCheck( cudaMemcpy(&lightsdev, &lights, sizeof(struct vec3) * MAX_LIGHTS, cudaMemcpyHostToDevice) );

cudaMalloc没有产生错误，但cudaMemcpy产生了错误。

如果我没有提供有关我的代码的足够信息，我已将整个代码粘贴到：http://pastebin.com/UgzABPgH

（请注意，在pastebin版本中，我在产生错误的CudaMemcpy上取出了CudaErrorCheck函数。）

非常感谢！

编辑：实际上，我刚刚试了一下，如果urand和irand不是全局变量，而是与设备数组uranddev和iranddev一起在局部初始化，会发生什么。我仍然得到相同的"invalid argument"错误，因此变量是否为全局变量应该与问题无关。

Answer 1

当您发布不完整，无法编译的代码而没有正确描述实际问题时，绝对不可能说出任何内容。通过在StackOverflow上提出更好的问题，您将得到更好的答案。

话虽如此，最可能的问题不是数据没有被复制到设备或从设备复制回来，而是内核本身根本没有运行。每个CUDA运行时API调用都会返回一个状态代码，您应该检查所有这些代码。您可以像这样定义一个错误检查宏：

#include <stdio.h>

// Wraps a CUDA runtime call: evaluates `call` once and hands its status code,
// together with the call site's file name and line number, to cudaAssert.
// The do { ... } while (0) form makes the macro behave as a single statement,
// so `cudaErrorCheck(x);` is safe inside an unbraced if/else.
#define cudaErrorCheck(call) do { cudaAssert((call), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call on stderr and terminates the program.
//   err  - status code returned by the CUDA runtime call
//   file - source file of the call site (normally __FILE__)
//   line - source line of the call site (normally __LINE__)
// Returns normally, without printing anything, when err is cudaSuccess.
void cudaAssert(const cudaError err, const char *file, const int line)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
                file, line, cudaGetErrorString(err));
        exit(1);
    }
}

并将每个API调用包装在其中，如下所示：

// Example: the status code returned by cudaMemcpy is passed to cudaErrorCheck
// instead of being discarded.
cudaErrorCheck( cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost) );

对于内核启动，您可以检查启动失败或运行时错误，如下所示：

// A kernel launch returns no status code itself, so the status is queried
// explicitly right after the launch.
kernel<<<....>>>();
cudaErrorCheck( cudaPeekAtLastError() ); // Checks for launch error
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
cudaErrorCheck( cudaDeviceSynchronize() ); // Checks for execution error

我的建议是为您的代码添加彻底的错误检查，然后返回并根据您获得的结果编辑您的问题。然后有人可能能够提供有关正在发生的事情的具体建议。

Answer 2

我认为你没有正确使用<<< >>>语法。

这是来自CUDA Programming Guide的内核调用示例：

// Launch configuration order: <<<number of blocks, threads per block>>>.
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);

这意味着网格尺寸应该先行。

对内核参数的最大大小也有限制。见this。如果你超越它，我不确定编译器是抱怨还是继续做坏事。

如果删除device_fb之外的所有参数，并在内核中设置device_fb[index]=index，我可以成功读取这些值。

CUDA - 为什么我的设备数据没有传输到主机？

2 个答案: