Question

我需要在内核函数中动态分配一些数组。我怎么能这样做？

我的代码是这样的：

__global__ func(float *grid_d,int n, int nn){  
    int i,j;  
    float x[n],y[nn];  
    //Do some really cool and heavy computations here that takes hours.  
}

但这不起作用。如果这是在主机代码中我可以使用malloc。 cudaMalloc需要主机上的指针，以及设备上的其他指针。在内核函数内部，我没有主机指针。

那么，我该怎么办？

如果花费太长时间（几秒钟）来分配所有数组（我需要大约n的4和大小为nn的5），这将不是问题。因为内核可能至少运行20分钟。

Answer 1

仅计算能力2.x和更新的硬件支持动态内存分配。您可以在内核中使用C ++ new关键字或malloc，因此您的示例可能变为：

__global__ func(float *grid_d,int n, int nn){  
    int i,j;  
    float *x = new float[n], *y = new float[nn];   
}

这会在具有上下文生命周期的本地内存运行时堆上分配内存，因此如果您打算不再使用内存，请确保在内核完成运行后释放内存。您还应注意，无法直接从主机API访问运行时堆内存，因此您无法将在内核中分配的指针作为参数传递给cudaMemcpy，例如。

Answer 2

@talonmies回答了关于如何在内核中动态分配内存的问题。这旨在作为补充答案，解决__device__ malloc()的性能以及您可能想要考虑的替代方案。

在内核中动态分配内存可能很诱人，因为它允许GPU代码看起来更像CPU代码。但它会严重影响性能。我写了一个自包含的测试，并将其包含在下面。该测试启动了约260万个线程。每个线程使用从线程索引派生的一些值填充16个全局内存的整数，然后将值相加并返回总和。

该测试实现了两种方法。第一种方法使用__device__ malloc()，第二种方法使用在内核运行之前分配的内存。

在我的2.0设备上，内核在使用__device__ malloc()时运行1500ms，在使用预先分配的内存时运行27ms。换句话说，当在内核中动态分配内存时，测试需要 56x 才能运行。时间包括外循环cudaMalloc() / cudaFree()，它不是内核的一部分。如果使用相同数量的线程多次启动相同的内核（通常是这种情况），cudaMalloc() / cudaFree()的成本将在所有内核启动时分摊。这使差异更大，达到60倍左右。

推测，我认为性能损失部分是由隐式序列化引起的。 GPU必须将所有同时调用序列化为__device__ malloc()，以便为每个调用者提供单独的内存块。

不使用__device__ malloc()的版本在运行内核之前分配所有GPU内存。指向内存的指针传递给内核。每个线程计算先前分配的内存的索引，而不是使用__device__ malloc()。

预先分配内存的潜在问题是，如果只有一些线程需要分配内存，并且不知道它们是哪些线程，则需要为所有线程分配内存。如果内存不足，那么使用__device__ malloc()减少每个内核调用的线程数可能更有效。其他解决方法可能最终会重新实现__device__ malloc()在后台执行的操作，并会看到类似的性能损失。

测试__device__ malloc()：

的效果

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

const int N_ITEMS(16);

#define USE_DYNAMIC_MALLOC

__global__ void test_malloc(int* totals)
{
  int tx(blockIdx.x * blockDim.x + threadIdx.x);

  int* s(new int[N_ITEMS]);

  for (int i(0); i < N_ITEMS; ++i) {
    s[i] = tx * i;
  }

  int total(0);
  for (int i(0); i < N_ITEMS; ++i) {
    total += s[i];
  }

  totals[tx] = total;

  delete[] s;
}

__global__ void test_malloc_2(int* items, int* totals)
{
  int tx(blockIdx.x * blockDim.x + threadIdx.x);

  int* s(items + tx * N_ITEMS);

  for (int i(0); i < N_ITEMS; ++i) {
    s[i] = tx * i;
  }

  int total(0);
  for (int i(0); i < N_ITEMS; ++i) {
    total += s[i];
  }

  totals[tx] = total;
}

int main()
{
  cudaError_t cuda_status;

  cudaSetDevice(0);

  int blocks_per_launch(1024 * 10);
  int threads_per_block(256);

  int threads_per_launch(blocks_per_launch * threads_per_block);

  int* totals_d;
  cudaMalloc((void**)&totals_d, threads_per_launch * sizeof(int));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaDeviceSynchronize();
  cudaEventRecord(start, 0);

#ifdef USE_DYNAMIC_MALLOC
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, threads_per_launch * N_ITEMS * sizeof(int));

  test_malloc<<<blocks_per_launch, threads_per_block>>>(totals_d);
#else
  int* items_d;
  cudaMalloc((void**)&items_d, threads_per_launch * sizeof(int) * N_ITEMS);

  test_malloc_2<<<blocks_per_launch, threads_per_block>>>(items_d, totals_d);

  cudaFree(items_d);
#endif

  cuda_status = cudaDeviceSynchronize();
  if (cuda_status != cudaSuccess) {
    printf("Error: %d\n", cuda_status);
    exit(1);
  }

  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  float elapsedTime;
  cudaEventElapsedTime(&elapsedTime, start, stop);

  printf("Elapsed: %f\n", elapsedTime);

  int* totals_h(new int[threads_per_launch]);
  cuda_status = cudaMemcpy(totals_h, totals_d, threads_per_launch * sizeof(int), cudaMemcpyDeviceToHost);
  if (cuda_status != cudaSuccess) {
    printf("Error: %d\n", cuda_status);
    exit(1);
  }

  for (int i(0); i < 10; ++i) {
    printf("%d ", totals_h[i]);
  }
  printf("\n");

  cudaFree(totals_d);
  delete[] totals_h;

  return cuda_status;
}

输出：

C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 27.311169
0 120 240 360 480 600 720 840 960 1080

C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 1516.711914
0 120 240 360 480 600 720 840 960 1080

Answer 3

如果在调用内核之前已知n和nn的值，那么为什么cudaMalloc主机端的内存并将设备内存指针传递给内核？

Answer 4

根据@ rogerdahl帖子中的概念进行实验。假设：

以64B块分配4MB内存。
该块中有1个GPU块和32个经线
在P100上运行

GPU本地的malloc +免费通话似乎比cudaMalloc + cudaFree通话快得多。该计划的输出：

Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
         timer for cuda malloc timer took 1.169631s
Starting timer for device malloc timer
Stopping timer for device malloc timer
         timer for device malloc timer took 0.029794s

我忽略了timer.h和timer.cpp的代码，但这里是测试本身的代码：

#include "cuda_runtime.h"
#include <stdio.h>
#include <thrust/system/cuda/error.h>

#include "timer.h"

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 32;
const int ITERATIONS = 1 << 12;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);

const int ARRAY_SIZE = 64;


void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
    if (err == cudaSuccess)
        return;
    std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
    exit (1);
}

__global__ void mallocai() {
    for (int i = 0; i < ITERATIONS_PER_BLOCKTHREAD; ++i) {
        int * foo;
        foo = (int *) malloc(sizeof(int) * ARRAY_SIZE);
        free(foo);
    }
}

int main() {

    Timer cuda_malloc_timer("cuda malloc timer");

    for (int i = 0; i < ITERATIONS; ++ i) {
        if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
        int * foo;
        cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
        cudaFree(foo);
    }
    cuda_malloc_timer.stop_and_report();
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    Timer device_malloc_timer("device malloc timer");
    device_malloc_timer.start();
    mallocai<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>();
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    device_malloc_timer.stop_and_report();
}

如果您发现错误，请参阅评论，我会尝试修复它们。

我用更大的一切再次运行它们：

const int BLOCK_COUNT = 56;
const int THREADS_PER_BLOCK = 1024;
const int ITERATIONS = 1 << 18;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);

const int ARRAY_SIZE = 1024;

cudaMalloc的速度仍然很慢：

Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
         timer for cuda malloc timer took 74.878016s
Starting timer for device malloc timer
Stopping timer for device malloc timer
         timer for device malloc timer took 0.167331s

Answer 5

Maybe you should test

cudaMalloc(&foo,sizeof(int) * ARRAY_SIZE * ITERATIONS);
cudaFree(foo);

instead

for (int i = 0; i < ITERATIONS; ++ i) {
    if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
    int * foo;
    cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
    cudaFree(foo);
}

如何在内核中动态分配数组？

5 个答案: