Question

我有一个包含主机功能和设备功能的CUDA程序 执行（） 。在主机功能中，我分配一个全局内存输出，然后将其传递给设备功能并用于存储在设备功能中分配的全局内存的地址。我想访问主机函数中的内核分配内存。以下是代码：

#include <stdio.h>
typedef struct                      
{
  int             * p;            
  int              num;            
} Structure_A;

__global__ void Execute(Structure_A *output);

int main(){

    Structure_A *output;
    cudaMalloc((void***)&output,sizeof(Structure_A)*1);
    dim3 dimBlockExecute(1,1);
    dim3 dimGridExecute(1,1);
    Execute<<<dimGridExecute,dimBlockExecute>>>(output);
    Structure_A * output_cpu;
    int * p_cpu;
    cudaError_t err;

    output_cpu= (Structure_A*)malloc(sizeof(Structure_A));
    err=cudaMemcpy(output_cpu,output,sizeof(Structure_A),cudaMemcpyDeviceToHost);    
    if( err != cudaSuccess)
    {
        printf("CUDA error a: %s\n", cudaGetErrorString(err));
        exit(-1);
    }
    p_cpu=(int *)malloc(sizeof(int));
    err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);    
    if( err != cudaSuccess)
    {
        printf("CUDA error b: %s\n", cudaGetErrorString(err));
        exit(-1);
    }   
    printf("output=(%d,%d)\n",output_cpu[0].num,p_cpu[0]);
    return 0;
}

__global__ void Execute(Structure_A *output){

    int thid=threadIdx.x;

    output[thid].p= (int*)malloc(thid+1);

    output[thid].num=(thid+1);

    output[thid].p[0]=5;
}

我可以编译程序。但是当我运行它时，我得到一个错误，表明以下内存复制功能中存在无效参数：

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);

CUDA版本是4.2。 CUDA卡：特斯拉C2075 操作系统：x86_64 GNU / Linux

编辑：修改代码并为output_cpu和p_cpu分配适当的内存大小。

Answer 1

这段代码有很多问题。例如，您只在这两行中分配1个字节，不足以容纳Structure_A的单个实例。

output_cpu= (Structure_A*)malloc(1);
p_cpu=(int *)malloc(1);

但是，您错误的直接原因是您正在从设备运行时堆分配的指针（即在设备代码中分配malloc或new）执行memcpy到主机指针。

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);

不幸的是，cudaMalloc，cudaFree和cudaMemcpy的主机运行时API目前与设备运行时堆上分配的内存不兼容。

如何将设备功能中分配的内存复制回主内存

1 个答案: