Question

我希望有一个Container类的实例在初始化时分配一些设备和主机内存。我想在设备代码中使用已分配的内存，而不传递实际指针（出于API原因）。

如何创建一个指向该设备内存成员的全局 __device__ 指针？如果有帮助，我很乐意使用 Thrust 库。

这是一个小例子：

#include <iostream>


// Holds a 4-int host buffer (h_int) and a 4-int device buffer (d_int) and
// provides helpers to copy the contents between them.
// Return codes of the CUDA calls are not checked, and the buffers are never freed.
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));  // host buffer, allocated per instance
    int *d_int;  // device buffer; assigned by cudaMalloc in the constructor
    // Fills the host buffer with 6s, allocates the device buffer and uploads
    // the host contents to it.
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    // Copies all 4 ints from h_int to d_int (blocking cudaMemcpy).
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    // Copies all 4 ints from d_int back to h_int (blocking cudaMemcpy).
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};

// Global instance: its constructor, which calls cudaMalloc/cudaMemcpy, runs
// during static initialization, i.e. before main() is entered.
Container stuff;


// Attempt to make the container's device buffer reachable from device code.
// Note that &stuff.d_int is the host address of the pointer member (an int**),
// not the device pointer value stored in it.
__device__ auto d_int = &stuff.d_int;  // How do I get that right?


// Kernel: each thread writes one value through the global d_int symbol at its
// flat global thread index. There is no bounds check, so the launch must not
// supply more threads than there are elements.
__global__ void edit() {  // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    // Evaluates to 1 for i == 0, 3 for i == 1 or 2, and 7 for i > 2.
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}


// Launches edit with 4 blocks of 1 thread each, copies the device buffer of the
// global Container back to the host and prints the four values without separators.
// Neither the launch nor the copy is checked for errors.
int main(int argc, char const *argv[]) {
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();  // blocking cudaMemcpy inside
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}

Answer 1

这里有两个问题：

1. 您无法以您尝试的方式静态初始化 __device__ 变量（而且您试图赋给它的值本身也不正确）。CUDA 运行时 API 提供了用于初始化全局作用域设备符号的函数，请改用它们。
2. 由于一些微妙的原因（原答案在此处链接了相关讨论），在全局作用域声明 stuff 并不保证能正常工作（严格来说这是未定义行为）。请改为在 main 作用域内声明它。

把这两点结合起来，您应该写出类似下面的代码：

// Device-side global pointer; its value is set from the host at run time.
__device__ int* d_int;

// ...

int main(int argc, char const *argv[]) {

    // Declared at main scope rather than at global scope.
    Container stuff;
    // Copy the device pointer value held in stuff.d_int (sizeof(int*) bytes)
    // into the __device__ symbol d_int.
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int*));

    edit<<<4, 1>>>();

    // ...

这是一个完整的例子：

$ cat t1199.cu
#include <iostream>


// Owns a 4-int host buffer (h_int) and a 4-int device buffer (d_int) and
// provides helpers to copy the contents between them.
//
// Both buffers are released in the destructor. Copying is disabled because a
// copy would share the same raw pointers and free them twice.
// Return codes of the CUDA calls are not checked here.
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));  // host buffer, freed in ~Container
    int *d_int = nullptr;  // device buffer; stays null if cudaMalloc fails

    // Fills the host buffer with 6s, allocates the device buffer and uploads
    // the host contents to it.
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }

    // Releases the device and host buffers (both calls accept a null pointer).
    ~Container() {
        cudaFree(d_int);
        free(h_int);
    }

    // Non-copyable: the instance is the sole owner of both raw buffers.
    Container(const Container&) = delete;
    Container& operator=(const Container&) = delete;

    // Copies all 4 ints from h_int to d_int (blocking cudaMemcpy).
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }

    // Copies all 4 ints from d_int back to h_int (blocking cudaMemcpy).
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};

// Container stuff;  // no longer declared at global scope; main() owns the instance


// Device-side global pointer. It cannot be initialized statically from
// &stuff.d_int; main() fills it in with cudaMemcpyToSymbol instead.
__device__ int  *d_int;


// Kernel: every thread stores one value into the buffer that the global device
// pointer d_int refers to, at the thread's flat global index
// (1 for index 0, 3 for indices 1 and 2, 7 for anything larger).
//
// Launch layout: 1D grid of 1D blocks. There is no bounds check, so the total
// number of threads must not exceed the number of ints behind d_int.
// The pointer is read from the global symbol instead of being passed as an
// argument, to keep the API simple.
__global__ void edit() {
    const auto tid = threadIdx.x + blockDim.x*blockIdx.x;
    int value = 1;
    if (tid > 0) {
        value += 2;
    }
    if (tid > 2) {
        value += 4;
    }
    d_int[tid] = value;
}


// Creates the Container at main scope, publishes its device pointer through the
// __device__ symbol d_int, runs the edit kernel with 4 blocks of 1 thread each,
// copies the result back and prints the four values without separators.
// Returns 0 on success and 1 if a checked CUDA call reports an error.
int main(int argc, char const *argv[]) {
    // Reports a failed CUDA call on stderr; yields true when the call succeeded.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    Container stuff;
    // Copy the pointer value itself (sizeof(int *) bytes), not the buffer contents.
    if (!ok(cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *)), "cudaMemcpyToSymbol")) {
        return 1;
    }
    edit<<<4, 1>>>();
    // A launch returns no status: configuration errors show up in
    // cudaGetLastError, execution errors at the next synchronizing call.
    if (!ok(cudaGetLastError(), "edit launch")) {
        return 1;
    }
    if (!ok(cudaDeviceSynchronize(), "edit execution")) {
        return 1;
    }
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$

在设备代码中使用指向设备内存的主机类成员

1 个答案: