Question

我正在尝试实现一个包含数据数组的结构体，并希望其中的数组是动态数组，例如：

// Bundles two float3 pointers (data0 and data1).
struct myStruct
{
  float3 *data0;
  float3 *data1;
};

// Sketch from the question: inside a loop, allocate a float3 array with
// device-side new, doubling the element count N on every iteration.
// NOTE(review): `someStatements` is a placeholder condition, and `data0` is not
// declared in this scope -- presumably `input.data0` was intended; confirm.
// NOTE(review): each iteration overwrites the previous pointer without a
// matching delete[]; confirm whether the old buffer is meant to be released.
__global__ void kernel(myStruct input) {
  unsigned int N = 2;  // initial element count
  while(someStatements) {
    data0 = new float3[N];
    // do somethings
    N *= 2;  // double the size for the next allocation
  }
}

如何在CUDA内核中执行此类操作？

Answer 1

如果要在计算能力2.x或3.x的设备上运行此代码，并使用较新版本的CUDA，那么您的内核代码几乎是正确的。Fermi和Kepler硬件上的CUDA 4.x和5.0支持C++的new运算符。请注意，使用new或malloc分配的内存位于设备的运行时堆上。它的生命周期与创建它的上下文相同，但目前无法直接通过CUDA主机API访问（也就是说，不能通过cudaMemcpy或类似的函数访问）。

我将您的结构和内核转换为一个简单的示例代码，您可以自己尝试看看它是如何工作的：

#include <cstdio>

/* Record holding a single pointer to float data. */
struct myStruct
{
    float* data;
};

// Writes x[i] = (float)i for every i in [0, n).
//
// x : destination array; must have room for at least n floats.
// n : number of elements to write; n == 0 writes nothing.
__device__
void fill(float * x, unsigned int n)
{
    // The index has the same unsigned type as n. A signed int index would be
    // compared after implicit conversion and would overflow for n > INT_MAX.
    for (unsigned int i = 0; i < n; ++i) {
        x[i] = (float)i;
    }
}

// For each i in [0, imax), allocates a float array of N = 2^i elements with
// device-side new, fills it with 0, 1, ..., N-1 via fill(), and stores the
// pointer in input[i].data.
//
// input : array of at least imax myStruct records, writable from the device.
// imax  : number of records to populate.
//
// Every thread that runs this kernel executes the whole loop and writes the
// same input[i] slots, so it is meant to be launched with a single thread
// (the accompanying main() uses <<<1,1>>>).
// The allocated arrays are not freed here.
__global__
void kernel(myStruct *input, const unsigned int imax)
{
    for (unsigned int i = 0, N = 1; i < imax; i++, N *= 2) {
        float * p = new float[N];
        // Device code has no exceptions; a failed device-heap allocation is
        // expected to yield a null pointer, so do not write through it.
        if (p) {
            fill(p, N);
        }
        // A null pointer is stored as-is so a reader can detect the failure.
        input[i].data = p;
    }
}

// Gather step: for each record idx in [0, imax), copies the last element of
// the array referenced by input[idx].data into output[idx]. The array length
// for record idx is taken to be 2^idx (1, 2, 4, ...).
__global__
void kernel2(myStruct *input, float *output, const unsigned int imax)
{
    unsigned int len = 1;  // length of the array held by the current record
    for (unsigned int idx = 0; idx < imax; ++idx) {
        const float *values = input[idx].data;
        output[idx] = values[len - 1];
        len *= 2;
    }
}

// Reports a failed CUDA runtime call on stderr and optionally terminates.
//
// code  : status returned by a CUDA runtime call.
// file  : source file name of the call site (normally __FILE__).
// line  : source line of the call site (normally __LINE__).
// Abort : when true (the default), exit the process with `code` as exit status.
//
// `file` is a pointer to const because __FILE__ is a string literal, which
// cannot bind to a plain `char *` in C++11 and later.
inline void gpuAssert(cudaError_t code, const char * file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }
}
// Wraps a CUDA runtime call and checks its status via gpuAssert.
// do { } while (0) makes the macro behave as a single statement, so
// `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Demo driver: builds nvals device-heap arrays with kernel(), gathers the last
// element of each into a cudaMalloc'ed buffer with kernel2(), copies that
// buffer to the host and prints one "index value" line per entry.
// Returns 0 on success; any CUDA failure is reported through gpuErrchk.
int main(void)
{

    const unsigned int nvals = 16;
    struct myStruct * _s;   // device array of records
    float * _f, * f;        // _f: device gather buffer, f: host copy

    gpuErrchk( cudaMalloc((void **)&_s, sizeof(struct myStruct) * size_t(nvals)) );
    size_t sz = sizeof(float) * size_t(nvals);
    gpuErrchk( cudaMalloc((void **)&_f, sz) );
    f = new float[nvals];

    kernel<<<1,1>>>(_s, nvals);
    gpuErrchk( cudaPeekAtLastError() );
    // Launches are asynchronous: synchronize so that a fault inside kernel()
    // is reported here instead of at a later, unrelated call.
    gpuErrchk( cudaDeviceSynchronize() );

    kernel2<<<1,1>>>(_s, _f, nvals);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaMemcpy(f, _f, sz, cudaMemcpyDeviceToHost) );

    // Release the host-API allocations explicitly. The arrays created with
    // device-side new live on the runtime heap and go away with the context
    // when the device is reset.
    gpuErrchk( cudaFree(_f) );
    gpuErrchk( cudaFree(_s) );
    gpuErrchk( cudaDeviceReset() );

    // Unsigned index matches nvals; %u is the matching conversion specifier.
    for(unsigned int i=0; i<nvals; i++) {
        fprintf(stdout, "%u %f\n", i, f[i]);
    }

    delete[] f;
    return 0;
}

需要注意几点：

此代码只有在Fermi或Kepler GPU上、并使用CUDA 4.x或5.0时才能编译和运行。
您必须将GPU的正确架构传递给nvcc才能编译它（例如，我在Linux上为GTX 670编译时使用的命令是 nvcc -arch=sm_30 -Xptxas="-v" -o dynstruct dynstruct.cu）。
示例代码使用一个“聚集”（gather）内核，将数据从运行时堆中的结构复制到主机API可以访问的已分配内存中，以便打印结果。这是针对我之前提到的限制（cudaMemcpy无法直接从运行时堆内存中的地址复制数据）的变通方法。我曾希望这一点能在CUDA 5.0中得到修复，但最新的候选发布版本仍然存在此限制。

CUDA，如何在CUDA内核中实现struct的动态数组

1 个答案: