Question

我想知道是否有人能解释一下在内核中使用 new 运算符时出现的这种行为。代码如下：

#include <stdio.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
using namespace std;
/*
 * Test kernel: every thread allocates a 30000-element cuComplex array with
 * device-side `new`, reports a failed allocation together with its
 * blockIdx.y, and releases the array again.
 *
 * Takes no arguments and produces no output other than the device-side
 * printf emitted when the allocation fails.
 */
__global__ void test()
{
    cuComplex *store = new cuComplex[30000];
    if (store == NULL) {
        // Allocation failed: report which block hit it and stop here so the
        // null pointer is never dereferenced or deleted below.
        printf("Unable to allocate %i\n", (int)blockIdx.y);
        return;
    }

    // NOTE(review): with the single-thread blocks launched in this file this
    // condition is never true; presumably the write only exists so the
    // compiler cannot optimise the allocation away. It has to happen BEFORE
    // the array is released, otherwise it is a use-after-free.
    if (threadIdx.x == 10000) store->x = 0.0f;

    // Memory obtained with new[] must be released with delete[].
    delete[] store;
}

/*
 * Report a failed CUDA runtime call on stderr and leave main() with a
 * non-zero exit status. Only valid inside main() because it expands to a
 * `return 1;`.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/*
 * Launches the `test` kernel on a 1 x 500 x 1 grid of single-thread blocks
 * and prints the elapsed time measured between two CUDA events.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the launch fails.
 * argc/argv are unused.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    float timestamp = 0.0f;
    cudaEvent_t event_start, event_stop;

    // Initialise
    CUDA_CHECK(cudaEventCreate(&event_start));
    CUDA_CHECK(cudaEventCreate(&event_stop));

    dim3 threadsPerBlock(1, 1, 1);
    dim3 blocks(1, 500, 1);

    // Record the start event once, immediately before the launch (the
    // original recorded it twice; only the last record is used for timing).
    CUDA_CHECK(cudaEventRecord(event_start, 0));
    test<<<blocks, threadsPerBlock, 0>>>();
    // A kernel launch returns nothing; launch errors surface here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(event_stop, 0));
    // Block until the stop event (and therefore the kernel) has completed.
    CUDA_CHECK(cudaEventSynchronize(event_stop));
    CUDA_CHECK(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test took  %fms \n", timestamp);

    CUDA_CHECK(cudaEventDestroy(event_start));
    CUDA_CHECK(cudaEventDestroy(event_stop));
    return 0;
}
#undef CUDA_CHECK

在 GTX680 上用 CUDA 5 运行这段代码并查看输出，会发现内存分配会随机地失败 :( 我猜也许是因为全局内存用完了，但我有 2GB 显存，而且同时活动的块最多为 16 个，所以用这种方式分配的内存量最多应为 16 × 30000 × 8 = 3.84×10^6 字节，即大约 3.84 MB。那么我还应该考虑哪些因素呢？

Answer 1

问题与设备端 malloc() 和 free() 调用所使用的堆的大小有关（内核中的 new 和 delete 也是从同一个堆中分配内存的）。有关详细信息，请参阅《NVIDIA CUDA C 编程指南》的第 3.2.9 节 “Call Stack” 和附录 B.16.1 “Heap Memory Allocation”。

如果将堆大小设置为满足内核的需求，您的测试就能正常运行：

    // Size the device malloc heap for the worst case of this test:
    // 500 blocks (blocks.y) x 30000 cuComplex elements per allocation.
    // NOTE(review): per the CUDA programming guide the heap size has to be
    // set on the host before launching a kernel that uses device-side
    // new/malloc, i.e. before the test<<<...>>> launch in main — confirm placement.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 500*30000*sizeof(cuComplex));

内核中的 new 运算符：奇怪的行为

1 个答案: