C CUDA从设备发送struct array到host

时间:2017-10-02 07:33:51

标签: c struct cuda

我有这个结构

// Record type used by both the host code and the kernel below:
// two int fields (x, y) and one float field (z).
struct Data {
    int x  // NOTE(review): this declaration has no terminating semicolon
    int y;
    float z;
};

我把它传给内核没有问题:

// Kernel: stores x = 1, y = 1 and z = 1.0 into the record d_data[myCounter].
// NOTE(review): myCounter is not declared anywhere in the code shown, so
// which record each thread writes cannot be determined from this listing.
__global__ void calculate(Data *d_data) {
    d_data[myCounter].x = 1;
    d_data[myCounter].y = 1;
    d_data[myCounter].z = 1.0;
}

// NOTE(review): the directive is spelled in upper case; the C/C++
// preprocessor directive is the lower-case `#define`.
#DEFINE MAX_SIZE 100

// Host driver: allocates MAX_SIZE Data records with malloc (host) and
// cudaMalloc (device), copies the host buffer to the device, launches
// `calculate` with 1 block of `elements` threads, copies the device buffer
// back into `data`, and then tries to print the first `elements` records.
int main() {
  Data * data = (Data *)malloc(MAX_SIZE * sizeof(Data));
  Data *d_data;

  const int DATA_BYTES = MAX_SIZE * sizeof(Data);
  int elements = 20;

  cudaError_t cudaStatus;

  // Failures below are only reported on stderr; execution continues.
  cudaStatus = cudaMalloc((void **)&d_data, DATA_BYTES);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
  }

  // `data` has not been written since malloc, so this copies whatever bytes
  // the allocation happened to contain.
  cudaStatus = cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
  }

  calculate << < 1, elements >> > (d_data);

  // NOTE(review): the return value of this cudaMemcpy is not stored, so the
  // check that follows re-tests cudaStatus from the host-to-device copy.
  cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
  }

  cudaDeviceSynchronize();

  // NOTE(review): `i` is not declared in the code shown.
  // NOTE(review): the loop reads through d_data (the pointer filled in by
  // cudaMalloc) from host code; `data` is the buffer the device-to-host copy
  // wrote into. The format string has %2d %2.1f %s while the three arguments
  // are the int, int and float fields of Data.
  for (i = 0; i < elements; i++) {
    printf("%2d %2.1f %s\n", d_data[i].x, d_data[i].y, 
    d_data[i].z);    // this prints nothing
  }
  cudaStatus = cudaDeviceReset();

}

当我在内核函数 calculate 中测试这个结构体数组时,它能打印出正确的结果;但是当我用 cudaMemcpy 把数据从设备传回主机时,既没有报错,也没有打印任何内容。我该怎样把这个结构体数组从设备传回主机?

2 个答案:

答案 0 :(得分:1)

您所显示的代码存在一些问题。

  1. 您的结构定义中缺少分号。
  2. 内核代码中没有提供变量 myCounter 的定义。
  3. main 中没有提供变量 i 的定义。
  4. 您正在尝试从设备变量 d_data 而不是主机变量 data 进行打印。这在 CUDA 中是非法的。先把数据复制到主机变量 data,然后从那里打印。
  5. 您使用的 printf 格式说明符不正确。结构中的数据类型依次为 int、int、float,而您使用的 %2d %2.1f %s 对应的是 int、float 和字符串(以空字符结尾的字符数组),与结构不匹配。

以下代码解决了上述问题,似乎对我来说正确运行:

    $ cat t430.cu
    #include <stdio.h>
    #include <stdlib.h>
    
    // Plain record copied between host and device memory:
    // two integer fields followed by one single-precision field.
    struct Data {
        int   x;  // the question's listing omitted this semicolon
        int   y;
        float z;
    };
    
    // Kernel: each thread fills one record of d_data with x = 1, y = 1, z = 1.0f.
    //
    // Launch layout: the record index is threadIdx.x alone, so this expects a
    // single 1-D block (with several blocks, every block would rewrite the
    // same records). There is no bounds check: the caller must not launch
    // more threads than d_data has elements.
    __global__ void calculate(Data *d_data) {
        int myCounter = threadIdx.x;  // this line was missing
        d_data[myCounter].x = 1;
        d_data[myCounter].y = 1;
        d_data[myCounter].z = 1.0f;  // float literal: no double-to-float conversion
    }
    
    #define MAX_SIZE 100
    
    // Reports a failed CUDA runtime call on stderr as "<what> (<error text>)".
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char *what) {
      if (status == cudaSuccess) {
        return true;
      }
      fprintf(stderr, "%s (%s)\n", what, cudaGetErrorString(status));
      return false;
    }
    
    // Allocates MAX_SIZE Data records on the host and on the device, launches
    // `calculate` with one block of `elements` threads, copies the records back
    // and prints the first `elements` of them from the HOST buffer.
    // Returns 0 on success and 1 if an allocation or any CUDA call fails; the
    // remaining steps are skipped after the first failure.
    int main() {
      const size_t DATA_BYTES = MAX_SIZE * sizeof(Data);
      const int elements = 20;  // threads in the single block; must stay <= MAX_SIZE
      int i;   // this line was missing
      int exitCode = 1;  // stays 1 unless every step below succeeds
      Data *d_data = NULL;
    
      // Zero-initialised so the host-to-device copy never transfers
      // indeterminate bytes.
      Data *data = (Data *)calloc(MAX_SIZE, sizeof(Data));
      if (data == NULL) {
        fprintf(stderr, "host allocation failed!\n");
        return exitCode;
      }
    
      if (cudaOk(cudaMalloc((void **)&d_data, DATA_BYTES), "cudaMalloc failed!") &&
          cudaOk(cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice),
                 "cudaMemcpy failed!")) {
        calculate<<<1, elements>>>(d_data);
    
        // A kernel launch returns no status: cudaGetLastError() reports launch
        // errors. The blocking device-to-host cudaMemcpy then waits for the
        // kernel, and its own return value is what gets checked.
        if (cudaOk(cudaGetLastError(), "kernel launch failed!") &&
            cudaOk(cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost),
                   "cudaMemcpy failed!")) {
          for (i = 0; i < elements; i++) {
            printf("%2d %2d %2.1f\n", data[i].x, data[i].y,
            data[i].z);    // this was trying to print from d_data
          }
          exitCode = 0;
        }
      }
    
      // Best-effort cleanup on every path; cudaFree(NULL) is a no-op.
      cudaFree(d_data);
      free(data);
      cudaDeviceReset();
      return exitCode;
    }
    $ nvcc -arch=sm_61 -o t430 t430.cu
    $ cuda-memcheck ./t430
    ========= CUDA-MEMCHECK
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
     1  1 1.0
    ========= ERROR SUMMARY: 0 errors
    $
    

答案 1 :(得分:-1)

您应该将此宏添加到您的代码中

// Evaluates a CUDA runtime call once; if the result is not cudaSuccess,
// prints the file, line and cudaGetErrorString() text to stderr and exits
// with EXIT_FAILURE. Needs <stdio.h> and <stdlib.h>.
// Every line of the definition except the last ends in a backslash so that
// the whole do/while belongs to the macro; the trailing semicolon is left to
// the call site: CUDA_SAFE_CALL(cudaMalloc(&p, n));
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = (call);                                         \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.",  \
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

然后:

CUDA_SAFE_CALL(cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost));
// cudaDeviceSynchronize() also returns a cudaError_t, so it is checked the same way.
CUDA_SAFE_CALL(cudaDeviceSynchronize());

顺便说一下,你的myCounter似乎不对。你能否在上面的代码中提供myCounter值的一些细节?