How can I run CUDA kernel calls and CPU functions in parallel?

Asked: 2018-10-11 15:40:17

Tags: c++ cuda openmp

I have a program that runs on the GPU using CUDA with a lot of small kernels, which means that the kernel launches on my CPU need about the same time as the kernel execution on my GPU.

I would like to add a CPU function into my program loop that needs about the same time as one iteration of all the kernels. I know that the CPU can work asynchronously to the GPU after a kernel launch, but because my last kernel launch is not far ahead of the GPU work being done, this is no option in this case.
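
To be clear, by asynchronous work I mean the usual overlap pattern, something like the following sketch (some_kernel and do_cpu_work are placeholders, not my actual code):

some_kernel<<<blocks, threads>>>(d_data, N); // launch returns immediately
do_cpu_work();                               // runs while the GPU is still busy
cudaDeviceSynchronize();                     // wait for the GPU to finish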

So, my idea was to use multiple threads: one thread that launches my GPU kernels and another thread (or several other threads) that executes the CPU function, and run the two in parallel.

I created a small example to test this idea:

#include <unistd.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#define THREADS_PER_BLOCK 64

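// Dummy kernels: element-wise math with no particular meaning, just enough
// work to keep the GPU busy for a measurable amount of time per launch.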
__global__ void k_dummykernel1(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float ai = a[id];
        float bi = b[id];

        c[id] = powf(expf(bi*sinf(ai)),1.0/bi);
    }
}

__global__ void k_dummykernel2(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];

        c[id] = powf(c[id],bi);
    }
}

__global__ void k_dummykernel3(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];

        c[id] = logf(c[id])/bi;
    }
}

__global__ void k_dummykernel4(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {

        c[id] = asinf(c[id]);
    }
}

int main()
{
    int N = 10000;
    int N2 = N/5;

    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];

    float *d_a,*d_b,*d_c;

    for(int i = 0; i < N; i++)
    {
        a[i] = (10*(1+i))/(float)N;
        b[i] = (i+1)/50.0;
    }



    cudaMalloc((void**)&d_a,N*sizeof(float));
    cudaMalloc((void**)&d_b,N*sizeof(float));
    cudaMalloc((void**)&d_c,N*sizeof(float));

    cudaMemcpy(d_a, a ,N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b ,N*sizeof(float), cudaMemcpyHostToDevice);


    cudaProfilerStart();


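    // Phase 1: sequential reference - 2x4 kernel launches, then the CPU loop on the same thread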
    for(int k = 0; k < 100; k++)
    {

        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

        for(int i = 0; i < N2; i++)
        {
            c[i] = pow(a[i],b[i]);
        }

    }

    cudaDeviceSynchronize();
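    // short pause so the two phases show up clearly separated in the profiler timeline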
    usleep(40000);

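    // Phase 2: the same work, but kernel launches and CPU loop run in parallel OpenMP sections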
    for(int k = 0; k < 100; k++)
    {

#pragma omp parallel sections num_threads(2)
        {
#pragma omp section
            {
                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            }

#pragma omp section
            {
                for(int i = 0; i < N2; i++)
                {
                    c[i] = pow(a[i],b[i]);
                }
            }
        }
    }

    cudaDeviceSynchronize();

    cudaProfilerStop();

    delete[] a;
    delete[] b;
    delete[] c;

    cudaFree((void*)d_a);
    cudaFree((void*)d_b);
    cudaFree((void*)d_c);
}

I compile using nvcc main.cu -O3 -Xcompiler -fopenmp

First I run the 2x4 kernels and the CPU computation sequentially, and afterwards I try to do the same thing in parallel using OpenMP sections.

This is the result in the profiler: Complete Profiler Timeline

The parallel version is much slower than the sequential one...

If I zoom into the sequential part, it looks like this: Sequential Timeline

You can see that between every 8 kernel launches there is a gap where the CPU computation is done (this is the gap I would like to close by overlapping it with the kernel calls).

If I zoom into the parallel part (same zoom level!), it looks like this: Parallel Timeline

The gaps are gone, but the kernel launches now need about 15 microseconds each (compared to 5 microseconds before).

I also tried bigger array sizes and std::thread instead of OpenMP, but the problem is always the same as before.
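
The std::thread variant looked roughly like this for one iteration of the outer loop (a sketch assuming #include <thread>, not my exact code):

std::thread gpu_thread([&]() {
    // the same 2x4 kernel launches as in the OpenMP section
    for(int r = 0; r < 2; r++)
    {
        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
    }
});

for(int i = 0; i < N2; i++)
{
    c[i] = pow(a[i],b[i]);
}

gpu_thread.join();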

Can someone tell me whether this can even be made to work at all, and if so, what am I doing wrong?

Thanks in advance

1 Answer:

Answer 0: (score: 2)

I don't get results quite as spectacular as yours, so I'm not sure whether this will actually help you. I saw that the second thread has slower API calls, so making sure that only one thread handles all CUDA API calls improves the results somewhat. This is generally a good idea, and as you can see, sections does not guarantee that in your case. A simple approach is this:

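// note: omp_get_thread_num() needs #include <omp.h>;
// KMAX is the iteration count (1000 for the timings below)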
#pragma omp parallel num_threads(2)
{
    for(int k = 0; k <= KMAX; k++)
    {
        if (omp_get_thread_num() == 0)
        {
            k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

            k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        }
        else
        {
            for(int i = 0; i < N2; i++)
            {
                c[i] = pow(a[i],b[i]);
            }
        }
        // this makes sure that the behavior is consistent
        #pragma omp barrier
    }
}

Note that I also moved the parallel region outside of the loop to reduce the thread-management overhead.

Another aspect to consider is that performance monitoring itself affects your performance, especially for these very short API calls. I added timing and increased the k loop to 1000, and from the console I get the following results:

Serial time:                   0.036724
Parallel time (pinned thread): 0.021165
Parallel time (sections):      0.027331

With nvprof I get:

Serial time:                   0.058805
Parallel time (pinned thread): 0.054116
Parallel time (sections):      0.053535

So basically, you have to take the results from the visual profiler with a grain of salt. The insight from a detailed trace is often very useful, but in this case you should rely on end-to-end measurements instead.
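
For reference, the end-to-end timing itself can be as simple as this (a sketch assuming #include <omp.h> and <cstdio>, not necessarily the exact code I used):

double t0 = omp_get_wtime();      // wall-clock start
// ... run the serial or parallel variant under test here ...
cudaDeviceSynchronize();          // make sure all queued GPU work is included
double t1 = omp_get_wtime();
printf("time: %f\n", t1 - t0);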