Question

我在C中有一个结构，其中包含结构数组，我需要把它复制到GPU中。为此，我正在编写一个函数，该函数对结构中的成员执行若干cudaMalloc和cudaMemcpy，把数据从主机复制到设备。

结构的一个简单版本（实际版本中具有各种结构和变量/数组）是：

// A single graph node. It holds only a pointer to an array of floats
// describing the node's position.
struct Node {

    float* position;  // pointer to this node's float data

};

// Top-level container: an array of nodes plus an array of boundary values,
// each paired with its element count.
struct Graph{
    unsigned int nNode;       // number of elements in `node`
    Node* node;               // array of nNode nodes
    unsigned int nBoundary;   // number of elements in `boundary`
    unsigned int* boundary;   // array of nBoundary unsigned values
};

我的问题是我在内存分配和结构副本中一定做错了。当我使用Graph复制变量时，可以看到它们已正确复制（通过在内核中进行访问，如下例所示）。例如，我可以检查graph.nBoundary=3。

但是，只有在不为Node*分配和复制内存的情况下，我才能看到这个结果。一旦这样做，我得到的就是-858993460而不是3。有趣的是，Node*的分配本身并没有问题，因为我可以检查graph.node[0].position[0]的值，并且它具有正确的值。

只有graph.nBoundary会发生这种情况。所有其他变量都保留正确的数值，但是在对Node*运行cudaMemcpy时，这个变量会“出错”。

我在做什么错，为什么会这样？我该如何解决？

让我知道是否需要更多信息。

MCVE：

Node*

Answer 1

问题出在函数cudaGraphMalloc上，您正在其中尝试将设备内存分配给已经在设备上分配的outGraph的成员。在此过程中，您正在取消引用主机上的设备指针，这是非法的。

要将设备内存分配给设备上存在的struct类型变量的成员，我们首先必须创建该struct类型的临时主机变量，然后将设备内存分配给其成员，并且然后将其复制到设备上存在的结构中。

我已经回答了类似的问题here。请看一下。

固定代码如下：

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cuda.h>

// A point, part of some elements.
// Holds only a pointer to the point's float data; the demo data built in
// main() allocates 3 floats per node.
struct Node {

    float* position;  // pointer to this node's float array

};

// Top-level container: an array of nodes plus an array of boundary values,
// each paired with its element count. cudaGraphMalloc() builds a device-side
// copy of this struct whose pointer members refer to device memory.
struct Graph {
    unsigned int nNode;       // number of elements in `node`
    Node* node;               // array of nNode nodes
    unsigned int nBoundary;   // number of elements in `boundary`
    unsigned int* boundary;   // array of nBoundary unsigned values
};
Graph* cudaGraphMalloc(const Graph* inGraph);
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do/while(0) wrapper makes the macro behave like a
// single statement, so `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default) the process exits, using the CUDA error
//           code as the exit status; when false execution continues
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Debug kernel: stores graph->nBoundary into d_res[0] so the host can check
// that the device-side Graph was populated correctly.
//   graph - pointer to a Graph that is dereferenced on the device
//   d_res - output buffer with room for at least one unsigned int
// Every launched thread performs the same single store; this file launches it
// with one block of one thread. No shared memory is used.
__global__ void testKernel(Graph* graph, unsigned int * d_res) {
    const unsigned int boundaryCount = graph->nBoundary;
    *d_res = boundaryCount;
}
// Allocates `bytes` of host memory, or prints a message and exits if the
// allocation fails. Callers in this file only pass non-zero sizes.
static void* hostAllocOrExit(size_t bytes)
{
    void* block = malloc(bytes);
    if (block == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return block;
}

// Releases a device-resident Graph produced by cudaGraphMalloc(), including
// the boundary array, the node array and every per-node position buffer.
// The pointer members live in device memory, so they are first copied back to
// the host; dereferencing d_graph directly on the host would be illegal.
static void freeDeviceGraph(Graph* d_graph)
{
    if (d_graph == NULL) {
        return;
    }

    // Host copy of the struct; its pointer members still refer to device memory.
    Graph shadow;
    gpuErrchk(cudaMemcpy(&shadow, d_graph, sizeof(Graph), cudaMemcpyDeviceToHost));

    if (shadow.nNode > 0) {
        // Host copy of the node array, needed to learn each position pointer.
        const size_t nodeBytes = shadow.nNode * sizeof(Node);
        Node* nodeShadow = (Node*)hostAllocOrExit(nodeBytes);
        gpuErrchk(cudaMemcpy(nodeShadow, shadow.node, nodeBytes, cudaMemcpyDeviceToHost));
        for (unsigned int i = 0; i < shadow.nNode; i++) {
            gpuErrchk(cudaFree(nodeShadow[i].position));
        }
        free(nodeShadow);
    }

    gpuErrchk(cudaFree(shadow.node));
    gpuErrchk(cudaFree(shadow.boundary));
    gpuErrchk(cudaFree(d_graph));
}

// Builds a small Graph on the host, copies it to the device, reads
// nBoundary back through testKernel, prints the host and device values, and
// then releases every host and device allocation.
int main()
{
    // Sizes of the fake data set.
    const unsigned int nodeCount = 2;
    const unsigned int boundaryCount = 3;
    // Must stay 3: cudaGraphMalloc() copies exactly 3 floats per node.
    const unsigned int floatsPerNode = 3;

    // Generate some fake data on the CPU
    Graph graph;
    graph.nNode = nodeCount;
    graph.nBoundary = boundaryCount;
    graph.node = (Node*)hostAllocOrExit(nodeCount * sizeof(Node));
    graph.boundary = (unsigned int*)hostAllocOrExit(boundaryCount * sizeof(unsigned int));
    for (unsigned int i = 0; i < boundaryCount; i++) {
        graph.boundary[i] = i + 10;
    }
    for (unsigned int i = 0; i < nodeCount; i++) {
        // They can have different sizes in the original code
        graph.node[i].position = (float*)hostAllocOrExit(floatsPerNode * sizeof(float));
        graph.node[i].position[0] = 45.0f;
        graph.node[i].position[1] = 1.0f;
        graph.node[i].position[2] = 2.0f;
    }

    // Allocate GPU memory and deep-copy the graph into it
    Graph* d_graph = cudaGraphMalloc(&graph);

    // Device/host scratch used to read one value back from the kernel
    unsigned int* d_res = NULL;
    gpuErrchk(cudaMalloc((void**)&d_res, sizeof(unsigned int)));
    unsigned int* h_res = (unsigned int*)hostAllocOrExit(sizeof(unsigned int));

    // Run kernel
    testKernel<<<1, 1>>>(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    // Blocking copy: it also reports any error raised while the kernel ran.
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%u\n", h_res[0]);

    // Release device memory
    gpuErrchk(cudaFree(d_res));
    freeDeviceGraph(d_graph);

    // Release host memory
    free(h_res);
    for (unsigned int i = 0; i < nodeCount; i++) {
        free(graph.node[i].position);
    }
    free(graph.node);
    free(graph.boundary);

    return 0;
}

// Deep-copies a host Graph to the device and returns a device pointer to it.
//
// A device pointer must never be dereferenced on the host, so the device-side
// struct cannot be filled member by member. Instead the copy is assembled in
// host-side staging objects whose pointer members hold device addresses, and
// those staging objects are then uploaded.
//
//   inGraph - host graph to copy; every node's `position` must point to at
//             least 3 floats, because exactly 3 floats are copied per node
//   returns - device pointer to the copied Graph, or NULL when inGraph is NULL
//
// Any CUDA failure is reported through gpuErrchk. The caller is responsible
// for releasing the device allocations made here (the boundary array, each
// node's position buffer, the node array and the Graph itself).
Graph* cudaGraphMalloc(const Graph* inGraph)
{
    if (inGraph == NULL) {
        return NULL;
    }

    // Number of floats copied for every node. Node carries no length field,
    // so this fixed size is the only one this function can honour.
    const size_t floatsPerNode = 3;
    const size_t positionBytes = floatsPerNode * sizeof(float);

    // Host staging copy of the Graph; pointer members receive device addresses.
    Graph staged;
    staged.nNode = inGraph->nNode;
    staged.nBoundary = inGraph->nBoundary;

    // Boundary array
    const size_t boundaryBytes = staged.nBoundary * sizeof(unsigned int);
    gpuErrchk(cudaMalloc((void**)&(staged.boundary), boundaryBytes));
    gpuErrchk(cudaMemcpy(staged.boundary, inGraph->boundary, boundaryBytes, cudaMemcpyHostToDevice));

    // Host staging array of nodes: each entry gets its own device position
    // buffer, filled from the matching host node.
    std::vector<Node> stagedNodes(staged.nNode);
    for (size_t i = 0; i < stagedNodes.size(); ++i)
    {
        gpuErrchk(cudaMalloc((void**)&(stagedNodes[i].position), positionBytes));
        gpuErrchk(cudaMemcpy(stagedNodes[i].position, inGraph->node[i].position, positionBytes, cudaMemcpyHostToDevice));
    }

    // Upload the whole node array in one transfer instead of one per node.
    const size_t nodeBytesTotal = stagedNodes.size() * sizeof(Node);
    gpuErrchk(cudaMalloc((void**)&(staged.node), nodeBytesTotal));
    if (!stagedNodes.empty())
    {
        gpuErrchk(cudaMemcpy(staged.node, &stagedNodes[0], nodeBytesTotal, cudaMemcpyHostToDevice));
    }

    // Finally place the assembled Graph itself on the device.
    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(outGraph, &staged, sizeof(Graph), cudaMemcpyHostToDevice));

    return outGraph;
}

请注意，您必须保留内部设备指针的主机副本（即辅助主机变量）。这是因为您稍后必须释放设备内存，而主代码中只有Graph的设备副本，因此您无法从主机访问其成员来对它们调用cudaFree。在这种情况下，变量Node auxNodeHost（在每次迭代中创建）和Graph temp就是那些变量。

上面的代码并没有这样做，只是出于演示目的。

在Windows 10，Visual Studio 2015，CUDA 9.2，NVIDIA驱动程序397.44上进行了测试。

在CUDA中分配结构数组后变量丢失

1 个答案: