Question

我正在尝试测量GPU在内核函数中使用'malloc'分配内存与在主机上使用'cudaMalloc'预分配存储之间的性能差异。为此，我有两个内核函数，一个使用malloc，一个使用预先分配的数组，并且我重复执行每个函数。

问题是每个内核函数的第一次执行需要400到2500微秒，但所有后续运行大约需要15到30微秒。

这种行为是正常的，还是我看到了之前运行留下的某种遗留效应？如果是遗留效应，我该怎么做才能消除它呢？

我尝试过一个内核函数，在每次定时测试运行之间将GPU上的所有内存归零，以消除这种遗留，但没有任何改变。我也尝试颠倒我运行测试的顺序，这对相对或绝对执行时间没有影响。

// Number of list nodes that each test kernel creates and sums.
const int TEST_SIZE = 1000;

// Singly linked list node carrying one integer payload.
struct node {
    node* next;  // following node, or NULL when there is none
    int data;    // payload; the test kernels store the node's index here
};


/**
 * Entry point: repeats the static and the dynamic allocation test a fixed
 * number of times, calling memClear() before each individual test.
 */
int main() {

    const int numTests = 5;

    int round = 0;
    while (round < numTests) {
        // static test, preceded by a memory clear
        memClear();
        staticTest();

        // dynamic test, preceded by a memory clear
        memClear();
        dynamicTest();

        ++round;
    }

    return 0;
}

/**
 * Kernel: fills an automatic (per-thread) array of TEST_SIZE nodes and
 * stores the sum of their data fields in sum[0].
 *
 * Every launched thread does the same work and writes sum[0]; the caller
 * visible in this file launches exactly one thread.
 */
__global__ void staticMalloc(int* sum) {
    // fixed-size node storage, no dynamic allocation involved
    node nodes[TEST_SIZE];

    // give every node its own index as payload; the nodes stay unlinked
    int idx = 0;
    while (idx < TEST_SIZE) {
        nodes[idx].data = idx;
        nodes[idx].next = NULL;
        ++idx;
    }

    // add the payloads back up so the result can be verified
    int acc = 0;
    for (idx = TEST_SIZE - 1; idx >= 0; --idx) {
        acc += nodes[idx].data;
    }
    *sum = acc;
}

/**
 * Times one single-thread launch of the staticMalloc kernel and checks the
 * sum it produces.
 *
 * Kernel launches are asynchronous, so the device is synchronized before the
 * timer starts and again before it is read; without that only the launch
 * overhead would be measured.
 *
 * @return 0 when every CUDA call succeeds and the kernel's sum equals
 *         0 + 1 + ... + (TEST_SIZE - 1); 1 otherwise.
 */
int staticTest() {

    // value the kernel is expected to produce
    int expectedValue = 0;
    for (int i = 0; i < TEST_SIZE; ++i) {
        expectedValue += i;
    }

    // host output value; -1 marks "not written yet"
    int h_sum = -1;

    // device output value
    int* d_sum = NULL;
    const size_t bytes = sizeof(int);

    // allocate memory on device
    if (cudaMalloc(&d_sum, bytes) != cudaSuccess) {
        return 1;
    }

    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);

    // Drain any device work that is still pending so it is not billed to
    // this measurement, and discard a stale non-sticky error left by earlier
    // calls so the checks below reflect this launch only.
    cudaDeviceSynchronize();
    cudaGetLastError();

    Timer runTimer;  // NOTE(review): Timer is declared outside this view; lap() is presumably the time since start() - confirm

    // check static allocation time
    int runTime = 0;

    runTimer.start();
    staticMalloc<<<gridsize, blocksize>>>(d_sum);
    // wait for the kernel to finish before reading the timer
    cudaError_t status = cudaDeviceSynchronize();
    runTime += runTimer.lap();

    // launch-configuration errors are only reported through cudaGetLastError
    if (status == cudaSuccess) {
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy(&h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_sum);

    // runTime holds the measurement; the code visible here does not report it
    (void) runTime;

    if (status != cudaSuccess) {
        return 1;
    }
    return (h_sum == expectedValue) ? 0 : 1;
}

/**
 * Kernel: builds a singly linked list of TEST_SIZE nodes with in-kernel
 * malloc(), sums the data fields while freeing the nodes again, and stores
 * the sum in sum[0].
 *
 * In-kernel malloc() can return NULL; in that case every node allocated so
 * far is still freed and sum[0] is set to -1 instead of a partial sum.
 *
 * Every launched thread does the same work and writes sum[0]; the caller
 * visible in this file launches exactly one thread.
 */
__global__ void dynamicMalloc(int* sum) {

    // start a linked list
    node* headPtr = (node*) malloc(sizeof(node));
    if (headPtr == NULL) {
        sum[0] = -1;
        return;
    }
    headPtr->data = 0;
    headPtr->next = NULL;

    node* curPtr = headPtr;
    bool allocFailed = false;

    // add nodes to exercise malloc on the device
    for (int j = 1; j < TEST_SIZE; j++) {

        // allocate the node & assign values
        node* nodePtr = (node*) malloc(sizeof(node));
        if (nodePtr == NULL) {
            // stop growing the list; the partial list is freed below
            allocFailed = true;
            break;
        }
        nodePtr->data = j;
        nodePtr->next = NULL;

        // add it to the linked list
        curPtr->next = nodePtr;
        curPtr = nodePtr;
    }

    // verify creation by adding up values, releasing each node once visited
    int total = 0;
    while (headPtr != NULL) {
        // read everything needed from the node before it is freed
        node* nextPtr = headPtr->next;
        total += headPtr->data;

        // clean up memory
        free(headPtr);
        headPtr = nextPtr;
    }

    sum[0] = allocFailed ? -1 : total;
}

/**
 * Times one single-thread launch of the dynamicMalloc kernel and checks the
 * sum it produces.
 *
 * Kernel launches are asynchronous, so the device is synchronized before the
 * timer starts and again before it is read; without that only the launch
 * overhead would be measured.
 *
 * @return 0 when every CUDA call succeeds and the kernel's sum equals
 *         0 + 1 + ... + (TEST_SIZE - 1); 1 otherwise.
 */
int dynamicTest() {

    // value the kernel is expected to produce
    int expectedValue = 0;
    for (int i = 0; i < TEST_SIZE; ++i) {
        expectedValue += i;
    }

    // host output value; -1 marks "not written yet"
    int h_sum = -1;

    // device output value
    int* d_sum = NULL;
    const size_t bytes = sizeof(int);

    // allocate memory on device
    if (cudaMalloc(&d_sum, bytes) != cudaSuccess) {
        return 1;
    }

    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);

    // Drain any device work that is still pending so it is not billed to
    // this measurement, and discard a stale non-sticky error left by earlier
    // calls so the checks below reflect this launch only.
    cudaDeviceSynchronize();
    cudaGetLastError();

    Timer runTimer;  // NOTE(review): Timer is declared outside this view; lap() is presumably the time since start() - confirm

    // check dynamic allocation time
    int runTime = 0;

    runTimer.start();
    dynamicMalloc<<<gridsize, blocksize>>>(d_sum);
    // wait for the kernel to finish before reading the timer
    cudaError_t status = cudaDeviceSynchronize();
    runTime += runTimer.lap();

    // launch-configuration errors are only reported through cudaGetLastError
    if (status == cudaSuccess) {
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy(&h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    }

    cudaFree(d_sum);

    // runTime holds the measurement; the code visible here does not report it
    (void) runTime;

    if (status != cudaSuccess) {
        return 1;
    }
    return (h_sum == expectedValue) ? 0 : 1;
}

/**
 * Kernel: each thread writes a zero to one byte of `zeros`, selected by the
 * thread's flat global index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * There is no bounds check, so the buffer must hold at least
 * gridDim.x * blockDim.x bytes.
 */
__global__ void clearMemory(char *zeros) {
    const int offset = blockIdx.x * blockDim.x + threadIdx.x;
    zeros[offset] = 0;
}

/**
 * Allocates up to 1024 device buffers of 4 MiB each, zeroes every buffer
 * with the clearMemory kernel, then frees them all again.
 *
 * If an allocation fails, the function stops allocating and continues with
 * the buffers obtained so far, so no unallocated pointer is ever passed to
 * the kernel or to cudaFree.
 */
void memClear() {

    const int numBuffers = 1024;
    const size_t bufferBytes = 4 * 1024 * 1024;

    // One thread per byte. A block may hold at most 1024 threads, so the
    // 4 Mi threads are spread over 4096 blocks of 1024 threads each.
    const unsigned int threadsPerBlock = 1024;
    const unsigned int blocksPerBuffer =
            (unsigned int) (bufferBytes / threadsPerBlock);

    char *zeros[numBuffers]; // device pointers

    int allocated = 0;
    for (; allocated < numBuffers; ++allocated) {
        zeros[allocated] = NULL;
        if (cudaMalloc((void**) &(zeros[allocated]), bufferBytes) != cudaSuccess) {
            // reset the recorded error so later CUDA error checks are not
            // confused by this expected out-of-memory condition
            cudaGetLastError();
            break;
        }
        clearMemory<<<blocksPerBuffer, threadsPerBlock>>>(zeros[allocated]);
    }

    // make sure every clear kernel has finished before releasing the buffers
    cudaDeviceSynchronize();

    for (int i = 0; i < allocated; ++i) {
        cudaFree(zeros[i]);
    }
}

Answer 1

内核的第一次执行需要更多时间，因为你必须在GPU上加载很多东西（内核，lib等......）。为了证明这一点，你可以只测量启动空内核所需的时间，你会发现需要花费一些时间。试试：

time -> start

launch empty kernel

time -> end

firstTiming = end - start 

time -> start

launch empty kernel

time -> end

secondTiming = end - start

您会看到secondTiming明显小于firstTiming。

Answer 2

第一个CUDA（内核）调用会透明地初始化CUDA系统。您可以先调用一个空内核，避免把这部分初始化时间计入测量。请注意，OpenCL等也需要这种初始化，只是在那里你必须手动完成所有初始化工作，而CUDA会在后台为你完成。

然后你的计时方式有些问题：CUDA内核调用是异步的。所以（假设您的Timer类是像time()这样的主机计时器），目前您测量的是内核启动（launch）时间（以及第一次调用CUDA时的初始化时间），而不是内核执行时间。在启动计时器之前和停止计时器之前，您都至少需要调用一次cudaDeviceSynchronize()。

您最好使用可以精确测量内核执行时间的CUDA事件。使用主机定时器仍然包括启动开销。见https://devblogs.nvidia.com/parallelforall/how-implement-performance-metrics-cuda-cc/

多次调用CUDA内核会影响执行速度吗？

2 个答案: