CUDA vector addition produces a large number of errors

Date: 2016-07-08 05:21:06

Tags: cuda gpgpu

I am running vector-addition code written in CUDA. The code itself works fine, but a problem appears when I increase the vector size: the number of errors (elements where the CPU and GPU results differ) becomes very large. The code is attached below:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>   /* for fabs() in the CPU/GPU result comparison */

#include "cuda_utils.h"

#include "timer.h"

/*
 * **CUDA KERNEL** 
 * 
 * Compute the sum of two vectors 
 *   C[i] = A[i] + B[i]
 * 
 */
__global__ void vecAdd(float* a, float* b, float* c) {

  /* Calculate index for this thread */
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  /* Compute the element of C */
  c[i] = a[i] + b[i];
}

void compute_vec_add(int N, float *a, float* b, float *c);

/*
 * 
 * Host code to drive the CUDA Kernel
 * 
 */
int main() {

  float *d_a, *d_b, *d_c;
  float *h_a, *h_b, *h_c, *h_temp;
  int i;
  int N = 1024 * 1024 * 512;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
   Create the vectors
   */
  h_a = (float *) malloc(sizeof(float) * N);
  h_b = (float *) malloc(sizeof(float) * N);
  h_c = (float *) malloc(sizeof(float) * N);

  /*
   Set the initial values of h_a, h_b, and h_c
   */
  for (i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0;
  }

  /*
   Allocate space on the GPU
   */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, sizeof(float) * N));

  /*
   Copy d_a and d_b from CPU to GPU
   */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_a, h_a, sizeof(float) * N, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_b, h_b, sizeof(float) * N, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",
          t_pcie_htd);

  /*
   Run N/256 blocks of 256 threads each
   */
  dim3 GS(N / 256, 1, 1);
  dim3 BS(256, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  cudaThreadSynchronize();
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

  /*
   Copy d_c from GPU to CPU
   */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",
          t_pcie_dth);

  /* 
   Double check errors
   */
  h_temp = (float *) malloc(sizeof(float) * N);
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  int cnt = 0;
  for (int i = 0; i < N; i++) {
    if (fabs(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /*
   Free the device memory
   */
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  /*
   Free the host memory
   */
  free(h_a);
  free(h_b);
  free(h_c);

  /* 
   Free timer 
   */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
  }
}

void compute_vec_add(int N, float *a, float* b, float *c) {
  int i;
  for (i = 0; i < N; i++)
    c[i] = a[i] + b[i];
}

Edit: this is how I compile:

nvcc vecAdd.cu timer.o

When run on a GTX TITAN X, the code above produces the following output:

Timer: gettimeofday
Timer resolution: ~ 1 us (?)
Time to transfer data from host to device: 1.44104 secs
Time to execute GPU kernel: 0.000121 secs
Time to transfer data from device to host: 0.725893 secs
Time to execute CPU program: 2.96071 secs
number of errors: 350576933 out of 536870912

Also, given the high-bandwidth link between the CPU and the GPU, why does transferring ~2 GB of data from device to host take 0.72 seconds, and transferring ~4 GB from host to device take 1.44 seconds? Thanks.

1 Answer:

Answer 0 (score: 2)

In summary, there are a number of problems here:

1. You are compiling for the default architecture (sm_20), which limits your kernel grid to 65535 blocks in the x dimension. At this array size you are requesting a grid that is too large, so the kernel never runs. Fix it like this:

        nvcc -arch=sm_52 vecAdd.cu timer.o
    
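   As an aside, a grid-stride loop makes the kernel correct for any grid size, whatever architecture you compile for. Here is a minimal sketch; the stride loop and the explicit element count n are additions that are not in the question's kernel:

        __global__ void vecAddStride(const float* a, const float* b, float* c,
                                     size_t n) {
          /* Each thread handles elements i, i + stride, i + 2*stride, ... */
          size_t stride = (size_t) gridDim.x * blockDim.x;
          for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
               i += stride) {
            c[i] = a[i] + b[i];
          }
        }

        /* The grid can now be capped below the 65535-block limit: */
        vecAddStride<<<65535, 256>>>(d_a, d_b, d_c, (size_t) N);
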
2. You are not doing any error checking on the kernel launch, so you never detect that the launch fails. Fix it like this:

        vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
        CUDA_CHECK_ERROR(cudaPeekAtLastError());
        CUDA_CHECK_ERROR(cudaDeviceSynchronize());
      
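   CUDA_CHECK_ERROR itself comes from your cuda_utils.h, which is not shown; a typical definition of such a macro looks roughly like this sketch (not necessarily your exact version):

        #define CUDA_CHECK_ERROR(call)                                     \
          do {                                                             \
            cudaError_t e = (call);                                        \
            if (e != cudaSuccess) {                                        \
              /* Report the failing call's location and abort */           \
              fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                      cudaGetErrorString(e), __FILE__, __LINE__);          \
              exit(EXIT_FAILURE);                                          \
            }                                                              \
          } while (0)
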
3. At large problem sizes, the signed int used to compute memory allocation sizes can overflow, producing undefined results. You should use size_t instead. Fix it like this:

        size_t N = .....;
        size_t sz = N * sizeof(float);
        CUDA_CHECK_ERROR(cudaMalloc(&d_a, sz));
        // etc
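
   To make the overflow concrete, here is a hypothetical standalone snippet (not taken from the question) showing how a byte count computed in int goes wrong at this N, while the size_t version is fine:

        #include <stdio.h>

        int main(void) {
          int N = 1024 * 1024 * 512;                 /* 2^29 elements */
          int bad = N * (int) sizeof(float);         /* 2^31 overflows signed int: undefined */
          size_t good = (size_t) N * sizeof(float);  /* 2147483648 bytes, as intended */
          printf("int: %d, size_t: %zu\n", bad, good);
          return 0;
        }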