在cuda上矢量步加速慢

时间:2013-03-04 04:54:39

标签: c++ cuda parallel-processing gpu gpgpu

我正在尝试在CUDA C ++代码上运行向量步骤添加功能,但对于大小为5,000,000的大型浮点数组,它运行速度比我的CPU版本慢。以下是我所说的相关CUDA和cpu代码:

#define THREADS_PER_BLOCK 1024
typedef float real;
__global__ void vectorStepAddKernel2(real*x, real*y, real*z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
    {
        x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
    }
}

cudaError_t vectorStepAdd2(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{

    cudaError_t cudaStatus;
    int threadsPerBlock = THREADS_PER_BLOCK;
    int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
    vectorStepAddKernel2<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size, xstep, ystep, zstep);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching vectorStepAddKernel!\n", cudaStatus);
        exit(1);
    }

    return cudaStatus;
}

//CPU function:

void vectorStepAdd3(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
    for(int i=0;i<size;i++)
    {
        x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
    }
}

调用vectorStepAdd2导致计算速度比vectorStepAdd3慢,当3个数组中的每个数组大小为5,000,000且大小= 50,000时(即,以这种逐步方式将50,000个元素加在一起)。

关于如何加速GPU代码的任何想法? 我的设备是特斯拉M2090 GPU

由于

1 个答案:

答案 0 :(得分:5)

回答您的问题“我可以采取哪些措施来加速GPU代码?”

首先让我先说明提议的操作X = alpha * Y + beta * Z每个字节的数据传输所需的计算强度不大。结果,我无法在这个特定代码上超过CPU时间。但是,为了加快这段代码,可能需要考虑2个想法:

  1. 使用page-locked内存进行数据传输操作。这为GPU版本的数据传输时间减少了大约2倍,这主宰了GPU版本的整体执行时间。

  2. 使用@njuffa cudaMemcpy2D建议的here跨步复制技术。结果是2倍:我们可以将数据传输量减少到只有计算所需的数量,然后我们可以重新编写内核以连续操作数据,如评论(再次由njuffa)。这使得数据传输时间进一步提高了3倍,内核计算时间提高了约10倍。

  3. 此代码提供了这些操作的示例:

    #include <stdio.h>
    #include <stdlib.h>
    
    
    #define THREADS_PER_BLOCK 1024
    #define DSIZE 5000000
    #define WSIZE 50000
    #define XSTEP 47
    #define YSTEP 43
    #define ZSTEP 41
    #define TOL 0.00001f
    
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    typedef float real;
    
    __global__ void vectorStepAddKernel2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
        {
            x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
        }
    }
    
    __global__ void vectorStepAddKernel2i(real *x, real *y, real *z, real alpha, real beta, int size)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
        {
            x[i] = alpha* y[i] + beta*z[i];
        }
    }
    
    void vectorStepAdd2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
    {
    
        int threadsPerBlock = THREADS_PER_BLOCK;
        int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
        vectorStepAddKernel2<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size, xstep, ystep, zstep);
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel2 fail");
    }
    
    
    void vectorStepAdd2i(real *x, real *y, real *z, real alpha, real beta, int size)
    {
    
        int threadsPerBlock = THREADS_PER_BLOCK;
        int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
        vectorStepAddKernel2i<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size);
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel3 fail");
    }
    
    //CPU function:
    
    void vectorStepAdd3(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
    {
        for(int i=0;i<size;i++)
        {
            x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
        }
    }
    
    int main() {
    
      real *h_x, *h_y, *h_z, *c_x, *h_x1;
      real *d_x, *d_y, *d_z, *d_x1, *d_y1, *d_z1;
    
      int dsize = DSIZE;
      int wsize = WSIZE;
      int xstep = XSTEP;
      int ystep = YSTEP;
      int zstep = ZSTEP;
      real alpha = 0.5f;
      real beta = 0.5f;
      float et;
    
    /*
      h_x = (real *)malloc(dsize*sizeof(real));
      if (h_x == 0){printf("malloc1 fail\n"); return 1;}
      h_y = (real *)malloc(dsize*sizeof(real));
      if (h_y == 0){printf("malloc2 fail\n"); return 1;}
      h_z = (real *)malloc(dsize*sizeof(real));
      if (h_z == 0){printf("malloc3 fail\n"); return 1;}
      c_x = (real *)malloc(dsize*sizeof(real));
      if (c_x == 0){printf("malloc4 fail\n"); return 1;}
      h_x1 = (real *)malloc(dsize*sizeof(real));
      if (h_x1 == 0){printf("malloc1 fail\n"); return 1;}
    */
    
      cudaHostAlloc((void **)&h_x, dsize*sizeof(real), cudaHostAllocDefault);
      cudaCheckErrors("cuda Host Alloc 1 fail");
      cudaHostAlloc((void **)&h_y, dsize*sizeof(real), cudaHostAllocDefault);
      cudaCheckErrors("cuda Host Alloc 2 fail");
      cudaHostAlloc((void **)&h_z, dsize*sizeof(real), cudaHostAllocDefault);
      cudaCheckErrors("cuda Host Alloc 3 fail");
      cudaHostAlloc((void **)&c_x, dsize*sizeof(real), cudaHostAllocDefault);
      cudaCheckErrors("cuda Host Alloc 4 fail");
      cudaHostAlloc((void **)&h_x1, dsize*sizeof(real), cudaHostAllocDefault);
      cudaCheckErrors("cuda Host Alloc 5 fail");
    
    
      cudaMalloc((void **)&d_x, dsize*sizeof(real));
      cudaCheckErrors("cuda malloc1 fail");
      cudaMalloc((void **)&d_y, dsize*sizeof(real));
      cudaCheckErrors("cuda malloc2 fail");
      cudaMalloc((void **)&d_z, dsize*sizeof(real));
      cudaCheckErrors("cuda malloc3 fail");
      cudaMalloc((void **)&d_x1, wsize*sizeof(real));
      cudaCheckErrors("cuda malloc4 fail");
      cudaMalloc((void **)&d_y1, wsize*sizeof(real));
      cudaCheckErrors("cuda malloc5 fail");
      cudaMalloc((void **)&d_z1, wsize*sizeof(real));
      cudaCheckErrors("cuda malloc6 fail");
    
      for (int i=0; i< dsize; i++){
        h_x[i] = 0.0f;
        h_x1[i] = 0.0f;
        c_x[i] = 0.0f;
        h_y[i] = (real)(rand()/(real)RAND_MAX);
        h_z[i] = (real)(rand()/(real)RAND_MAX);
        }
    
    
      cudaEvent_t t_start, t_stop, k_start, k_stop;
      cudaEventCreate(&t_start);
      cudaEventCreate(&t_stop);
      cudaEventCreate(&k_start);
      cudaEventCreate(&k_stop);
      cudaCheckErrors("event fail");
    
      // first test original GPU version
    
      cudaEventRecord(t_start);
      cudaMemcpy(d_x, h_x, dsize * sizeof(real), cudaMemcpyHostToDevice);
      cudaCheckErrors("cuda memcpy 1 fail");
      cudaMemcpy(d_y, h_y, dsize * sizeof(real), cudaMemcpyHostToDevice);
      cudaCheckErrors("cuda memcpy 2 fail");
      cudaMemcpy(d_z, h_z, dsize * sizeof(real), cudaMemcpyHostToDevice);
      cudaCheckErrors("cuda memcpy 3 fail");
    
    
      cudaEventRecord(k_start);
      vectorStepAdd2(d_x, d_y, d_z, alpha, beta, wsize, xstep, ystep, zstep);
      cudaEventRecord(k_stop);
    
      cudaMemcpy(h_x, d_x, dsize * sizeof(real), cudaMemcpyDeviceToHost);
      cudaCheckErrors("cuda memcpy 4 fail");
      cudaEventRecord(t_stop);
      cudaEventSynchronize(t_stop);
      cudaEventElapsedTime(&et, t_start, t_stop);
      printf("GPU original version total elapsed time is: %f ms.\n", et);
      cudaEventElapsedTime(&et, k_start, k_stop);
      printf("GPU original kernel elapsed time is: %f ms.\n", et);
    
      //now test CPU version
    
      cudaEventRecord(t_start);
      vectorStepAdd3(c_x, h_y, h_z, alpha, beta, wsize, xstep, ystep, zstep);
      cudaEventRecord(t_stop);
      cudaEventSynchronize(t_stop);
      cudaEventElapsedTime(&et, t_start, t_stop);
      printf("CPU version total elapsed time is: %f ms.\n", et);
      for (int i = 0; i< dsize; i++)
        if (fabsf((float)(h_x[i]-c_x[i])) > TOL) {
          printf("cpu/gpu results mismatch at i = %d, cpu = %f, gpu = %f\n", i, c_x[i], h_x[i]);
          return 1;
          }
    
    
      // now test improved GPU version
    
      cudaEventRecord(t_start);
    //  cudaMemcpy2D(d_x1, sizeof(real),  h_x, xstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
    //  cudaCheckErrors("cuda memcpy 5 fail");
      cudaMemcpy2D(d_y1, sizeof(real),  h_y, ystep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
      cudaCheckErrors("cuda memcpy 6 fail");
      cudaMemcpy2D(d_z1, sizeof(real),  h_z, zstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
      cudaCheckErrors("cuda memcpy 7 fail");
    
      cudaEventRecord(k_start);
      vectorStepAdd2i(d_x1, d_y1, d_z1, alpha, beta, wsize);
      cudaEventRecord(k_stop);
    
      cudaMemcpy2D(h_x1, xstep*sizeof(real), d_x1, sizeof(real), sizeof(real), wsize, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cuda memcpy 8 fail");
      cudaEventRecord(t_stop);
      cudaEventSynchronize(t_stop);
      cudaEventElapsedTime(&et, t_start, t_stop);
      printf("GPU improved version total elapsed time is: %f ms.\n", et);
      cudaEventElapsedTime(&et, k_start, k_stop);
      printf("GPU improved kernel elapsed time is: %f ms.\n", et);
    
      for (int i = 0; i< dsize; i++)
        if (fabsf((float)(h_x[i]-h_x1[i])) > TOL) {
          printf("gpu/gpu improved results mismatch at i = %d, gpu = %f, gpu imp = %f\n", i, h_x[i], h_x1[i]);
          return 1;
          }
    
      printf("Results:i   CPU     GPU     GPUi \n");
      for (int i = 0; i< 20*xstep; i+=xstep)
        printf("    %d         %f      %f     %f    %f    %f\n",i, c_x[i], h_x[i], h_x1[i]);
    
    
      return 0;
    }
    

    如上所述,我仍然无法击败CPU时间,我将其归因于我自己缺乏编码技能,或者这个操作从根本上说没有足够的计算复杂性在GPU上有趣。不过这里有一些样本结果:

    GPU original version total elapsed time is: 13.352256 ms.
    GPU original kernel elapsed time is: 0.195808 ms.
    CPU version total elapsed time is: 2.599584 ms.
    GPU improved version total elapsed time is: 4.228288 ms.
    GPU improved kernel elapsed time is: 0.027392 ms.
    Results:i   CPU     GPU     GPUi
        0         0.617285      0.617285     0.617285
        47         0.554522      0.554522     0.554522
        94         0.104245      0.104245     0.104245
    ....
    

    我们可以看到,与原始内核相比,改进的内核整体减少了大约3倍,几乎所有这些都是由于数据复制时间的减少。数据复制时间的减少是由于改进的2D memcpy,我们只需要复制我们实际使用的数据。 (没有页面锁定的内存,这些数据传输时间大约是两倍)。我们还可以看到内核计算时间比原始内核的CPU计算快10倍,比改进内核的CPU计算快约100倍。然而,考虑到数据传输时间,我们无法克服CPU速度。

    最后一条评论是cudaMemcpy2D操作的“成本”仍然相当高。对于向量大小减少100倍,我们只看到复制时间缩短了3倍。因此,跨步访问仍然使得使用GPU的成本相对较高。如果我们只是简单地传输50,000个连续元素的向量,我们预计复制时间几乎会线性减少100倍(与5000000个元素的原始复制向量相比)。这意味着复制时间不到1毫秒,我们的GPU版本将比CPU快,至少这个天真的单线程CPU代码。