We are having performance issues when using CUDA Dynamic Parallelism. At the moment, CDP performs at least 3x slower than the traditional approach. We made the simplest reproducible code to show the problem, which is to increment the value of every element of an array by +1, i.e.,
a[0,0,0,0,0,0,0,.....,0] --> kernel +1 --> a[1,1,1,1,1,1,1,1,1]
The only purpose of this simple example is to see whether CDP can perform on par with the other approaches, or whether there is serious overhead.
The code is here:
#include <stdio.h>
#include <cuda.h>
#define BLOCKSIZE 512
__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);
// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
}
a[tid] += 1;
}
__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}
int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);
// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);
// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);
printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);
printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}
It can be compiled with:
nvcc -arch=sm_35 -rdc=true -lineinfo -lcudadevrt -use_fast_math main.cu -o prog
This example can compute the result with 3 methods: (0) a traditional single kernel launch, (1) dynamic parallelism, and (2) three kernels launched from the host using separate streams.
I get the following profiler timelines for method 0 (simple kernel), for method 1 (dynamic parallelism), and for method 2 (three streams from the host); the profiler screenshots are not reproduced here. The running times are these:
➜ simple-cdp git:(master) ✗ ./prog 16777216 0
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.140928 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 5.790048 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 2
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.011936 ms
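For reference, per-kernel timelines like the ones discussed below can also be collected with the command-line profiler, assuming nvprof from the CUDA toolkit is available on the system, e.g.:
nvprof ./prog 16777216 1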
The main problem, visible in the pictures, is that with the dynamic parallelism approach the parent kernel takes an excessive amount of time to close after the two child kernels have finished, which is what makes it 3x or 4x slower. Even in the worst case, if all three kernels (the parent and the two children) ran serially, it should take much less: each kernel does N/3 of the work, so the whole parent kernel should take roughly as long as three child kernels, which is far less than what is measured. Is there a way to solve this problem?
EDIT: the serialization of the child kernels, as well as that of method 2, was explained by Robert Crovella in the comments (thanks very much). The fact that the kernels ran serially does not invalidate the problem described in bold above (at least not for now).

Answer 0 (score 4):
Calls into the device runtime are expensive, just as calls into the host runtime are expensive. In this case it seems you are calling into the device runtime to create streams for every thread, even though this code only requires them for thread 0.

By modifying your code to request stream creation only for thread 0, we can produce timing parity between the case where we use separate streams for the child kernel launches and the case where we do not:
$ cat t370.cu
#include <stdio.h>
#define BLOCKSIZE 512
__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);
// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
#ifdef USE_STREAMS
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
#else
kernel_simple<<< grid, block >>> (a, n, N, n);
kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#endif
// these next 2 lines add noticeably to the overall timing
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
}
a[tid] += 1;
}
__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}
int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);
// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);
// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);
printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);
printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.925632 ms
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu -DUSE_STREAMS
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.673568 ms
$
Although it is not included in the test output above, according to my testing this also brings the CUDA Dynamic Parallelism (CDP) case (1) into "approximate parity" with the non-CDP cases (0, 2). Note that we can shave about 1 millisecond (!) off the times above by forgoing the cudaGetLastError() call in the parent kernel (which I added to your code).
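If the check is still wanted during development, one option (an assumption on my part, not part of the original answer; the macro name CHECK_CHILD_LAUNCH is made up) is to make it compile-time optional inside kernel_parent:

// guard the device-side error check so normal builds skip its cost
#ifdef CHECK_CHILD_LAUNCH
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
#endif

Build with -DCHECK_CHILD_LAUNCH only when debugging the child launches.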
Answer 1 (score 1):
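A minimal sketch of such a launch-overhead micro-benchmark (an assumed reconstruction, not necessarily the original code: empty kernels timed with CUDA events, plus one warm-up launch) could look like this:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void child() {}

__global__ void parent() {
    // device-side (nested) launch; the parent grid is not complete
    // until this child grid completes
    child<<<1, 1>>>();
}

// time one call to launch() with CUDA events, in milliseconds
static float time_ms(void (*launch)()) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    launch();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

static void single_launch()  { child<<<1, 1>>>(); }
static void two_sequential() { child<<<1, 1>>>(); child<<<1, 1>>>(); }
static void nested_launch()  { parent<<<1, 1>>>(); }

int main() {
    // warm-up launch so one-time initialization cost is not measured
    single_launch();
    cudaDeviceSynchronize();
    printf("=== Single kernel launch ===\ntime: %f ms\n", time_ms(single_launch));
    printf("=== 2x sequential kernel launch ===\ntime: %f ms\n", time_ms(two_sequential));
    printf("=== Nested kernel launch ===\ntime: %f ms\n", time_ms(nested_launch));
    return 0;
}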
To build/run:
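(assuming the sketch above is saved as nested.cu and a Pascal-class GPU such as the P5000, i.e. sm_61)
$ nvcc -arch=sm_61 -rdc=true -lcudadevrt -o nested nested.cu
$ ./nested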
On my p5000 laptop it prints:
=== Single kernel launch ===
time: 0.014297 ms
=== 2x sequential kernel launch ===
time: 0.030468 ms
=== Nested kernel launch ===
time: 0.083820 ms
So the overhead is quite large; in my case it looks like about 43 microseconds.