Why do I get incorrect results from my sum reduction implemented in CUDA?

Date: 2018-01-13 18:17:13

Tags: c++ cuda sum reduction

I am working through a tutorial on a vector_reduction algorithm implemented with the CUDA C++ API, and I am stuck: I really don't understand what I am doing wrong, because the result is (device: 4386.000000  host: 260795.000000).

The code I am using is below (the problem size is fixed at 512).

EDIT: Unfortunately the problem is still not solved; I still get the same result. I have updated the post to provide the complete code. The goal is the same: to sum all the elements of a 512-element float array.

#define NUM_ELEMENTS 512

__global__ void reduction(float *g_data, int n)
{
  __shared__ float s_data[NUM_ELEMENTS];
  int tid = threadIdx.x;
  int index = tid + blockIdx.x*blockDim.x;
  // Zero-fill so threads past the end of the array contribute nothing.
  s_data[tid] = 0.0;
  if (index < n){
    s_data[tid] = g_data[index];
  }
  __syncthreads();

  // Modulo-based tree reduction: at stride s, every s-th thread adds in the
  // partner s/2 positions away. This pairing covers all elements only when
  // blockDim.x is a power of two.
  for (int s = 2; s <= blockDim.x; s = s * 2){
    if ((tid%s) == 0){
      s_data[tid] += s_data[tid + s / 2];
    }
    __syncthreads();
  }

  // Thread 0 writes this block's partial sum back to global memory.
  if (tid == 0){
    g_data[blockIdx.x] = s_data[tid];
  }
}


// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>

// includes, kernels
#include "vector_reduction_kernel.cu"

// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);

float computeOnDevice(float* h_data, int array_mem_size);

extern "C" 
void computeGold( float* reference, float* idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{

    cudaSetDevice(0);
    runTest( argc, argv);
    return EXIT_SUCCESS;
}

////////////////////////////////////////////////////////////////////////////////
//! Run naive reduction test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv) 
{
    int num_elements = NUM_ELEMENTS;

    const unsigned int array_mem_size = sizeof( float) * num_elements;

    // Allocate host memory to store the input data
    float* h_data = (float*) malloc( array_mem_size);

    // initialize the input data on the host to be integer values
    // between 0 and 1000
    for( unsigned int i = 0; i < num_elements; ++i) 
        h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));

    // Function to compute the reference solution on CPU using a C sequential version of the algorithm
    // It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
    float reference = 0.0f;  
    computeGold(&reference , h_data, num_elements);

    // Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
    // The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
    float result = computeOnDevice(h_data, num_elements);

    // We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
    float epsilon = 0.0f;
    unsigned int result_regtest = (abs(result - reference) <= epsilon);
    printf( "Test %s\n", (1 == result_regtest) ? "Ok." : "No.");
    printf( "device: %f  host: %f\n", result, reference);
    // cleanup memory
    free( h_data);
}

// Function to call the CUDA kernel on the GPU.
// Takes h_data from the host, copies it to the device, sets up grid and
// thread dimensions, executes the kernel, and copies the result of the
// reduction back to h_data.
// Note: float* h_data is both the input and the output of this function.
float computeOnDevice(float* h_data, int num_elements)
{
  float* d_data = NULL;
  float result;

  // Memory allocation on device side
  cudaMalloc((void**)&d_data, sizeof(float)*num_elements);

  // Copy from host memory to device memory
  cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );

  //int threads = (num_elements/2) + num_elements%2;
  int threads = (num_elements);
  // Invoke the kernel
  reduction<<< 1 ,threads >>>(d_data,num_elements);

  // Copy from device memory back to host memory
  cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(d_data);
  cudaDeviceReset();
  return result;
}

float computeOnDevice(float* h_data, int num_elements)
{
  float* d_data = NULL;
  float result;

  // Memory allocation on device side
  cudaMalloc((void**)&d_data, sizeof(float)*num_elements);

  // Copy from host memory to device memory
  cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );

  int threads = (num_elements);

  // Invoke the kernel
  reduction<<< 1 ,threads >>>(d_data,num_elements);

  // Copy from device memory back to host memory
  cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_data);
  cudaDeviceReset();
  return result;
}

1 Answer:

Answer 0 (score: 2):

You should really provide a complete code for questions like this. You should also use proper CUDA error checking and run your code with cuda-memcheck. There are at least 2 errors in your code:

  1. This is not how cudaMemcpy is done:

      cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
    

    It should be:

      cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
    

    The first parameter is just a pointer, not a pointer-to-a-pointer. cuda-memcheck or proper CUDA error checking would have focused your attention on this line.

  2. You are not launching enough threads. Your kernel loads one element per thread, so if your problem size is 512, you are going to need 512 threads, and:

      int threads = (num_elements/2) + num_elements%2;
    

    is not getting you that. I'm not sure what you had in mind there, but this fixes the 512 case:

      int threads = (num_elements);
    

    Note that your reduction method also requires a power-of-two threadblock size (see the sketch at the end of this answer for a launcher that rounds an arbitrary size up).

  3. Here is a fully worked test case; note the use of cuda-memcheck:

    $ cat t27.cu
    #include <stdio.h>
    #define NUM_ELEMENTS 512

    __global__ void reduction(float *g_data, int n)
    {
      __shared__ float s_data[NUM_ELEMENTS];
      int tid = threadIdx.x;
      int index = tid + blockIdx.x*blockDim.x;
      s_data[tid] = 0.0;
      if (index < n){
        s_data[tid] = g_data[index];
      }
      __syncthreads();

      for (int s = 2; s <= blockDim.x; s = s * 2){
        if ((tid%s) == 0){
          s_data[tid] += s_data[tid + s / 2];
        }
        __syncthreads();
      }

      if (tid == 0){
        g_data[blockIdx.x] = s_data[tid];
      }
    }

    float computeOnDevice(float* h_data, int num_elements)
    {
      float* d_data = NULL;
      float result;

      // Memory allocation on device side
      cudaMalloc((void**)&d_data, sizeof(float)*num_elements);

      // Copy from host memory to device memory
      cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );

      int threads = (num_elements);

      // Invoke the kernel
      reduction<<< 1 ,threads >>>(d_data,num_elements);

      // Copy from device memory back to host memory
      cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
      cudaFree(d_data);
      cudaDeviceReset();
      return result;
    }

    int main(){

      float *data = new float[NUM_ELEMENTS];
      for (int i = 0; i < NUM_ELEMENTS; i++) data[i] = 1;
      float r = computeOnDevice(data, NUM_ELEMENTS);
      printf(" result = %f\n" , r);
    }
    $ nvcc -arch=sm_35 -o t27 t27.cu
    $ cuda-memcheck ./t27
    ========= CUDA-MEMCHECK
     result = 512.000000
    ========= ERROR SUMMARY: 0 errors
    

    Here is a modified version of the code you have now posted (broken apart in a few new/different ways) that seems to run correctly for me:

    $ cat t30.cu
    #define NUM_ELEMENTS 512

    __global__ void reduction(float *g_data, int n)
    {
      __shared__ float s_data[NUM_ELEMENTS];
      int tid = threadIdx.x;
      int index = tid + blockIdx.x*blockDim.x;
      s_data[tid] = 0.0;
      if (index < n){
        s_data[tid] = g_data[index];
      }
      __syncthreads();

      for (int s = 2; s <= blockDim.x; s = s * 2){
        if ((tid%s) == 0){
          s_data[tid] += s_data[tid + s / 2];
        }
        __syncthreads();
      }

      if (tid == 0){
        g_data[blockIdx.x] = s_data[tid];
      }
    }

    // includes, system
    #include <cuda_runtime.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <float.h>

    // includes, kernels

    // For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
    #define NUM_ELEMENTS 512

    ////////////////////////////////////////////////////////////////////////////////
    // declaration, forward
    void runTest( int argc, char** argv);

    float computeOnDevice(float* h_data, int array_mem_size);

    extern "C"
    void computeGold( float* reference, float* idata, const unsigned int len)
    {
      for (int i = 0; i<len; i++) *reference += idata[i];
    };

    ////////////////////////////////////////////////////////////////////////////////
    // Program main
    ////////////////////////////////////////////////////////////////////////////////
    int main( int argc, char** argv)
    {
        cudaSetDevice(0);
        runTest( argc, argv);
        return EXIT_SUCCESS;
    }

    ////////////////////////////////////////////////////////////////////////////////
    //! Run naive reduction test
    ////////////////////////////////////////////////////////////////////////////////
    void runTest( int argc, char** argv)
    {
        int num_elements = NUM_ELEMENTS;

        const unsigned int array_mem_size = sizeof( float) * num_elements;

        // Allocate host memory to store the input data
        float* h_data = (float*) malloc( array_mem_size);

        // initialize the input data on the host to be integer values
        // between 0 and 1000
        for( unsigned int i = 0; i < num_elements; ++i)
            h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));

        // Function to compute the reference solution on CPU using a C sequential version of the algorithm
        // It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
        float reference = 0.0f;
        computeGold(&reference , h_data, num_elements);

        // Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
        // The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
        float result = computeOnDevice(h_data, num_elements);

        // We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
        float epsilon = 0.0f;
        unsigned int result_regtest = (abs(result - reference) <= epsilon);
        printf( "Test %s\n", (1 == result_regtest) ? "CORRECTO: Coinciden los resultados de la CPU y la GPU" : "INCORRECTO: Los resultados calculados en paralelo en la GPU no coinciden con los obtenidos secuencialmente en la CPU");
        printf( "device: %f  host: %f\n", result, reference);
        // cleanup memory
        free( h_data);
    }

    // Function to call the CUDA kernel on the GPU.
    // Takes h_data from the host, copies it to the device, sets up grid and
    // thread dimensions, executes the kernel, and copies the result of the
    // reduction back to h_data.
    // Note: float* h_data is both the input and the output of this function.
    #if 0
    float computeOnDevice(float* h_data, int num_elements)
    {
      float* d_data = NULL;
      float result;

      // Memory allocation on device side
      cudaMalloc((void**)&d_data, sizeof(float)*num_elements);

      // Copy from host memory to device memory
      cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );

      //int threads = (num_elements/2) + num_elements%2;
      int threads = (num_elements);
      // Invoke the kernel
      reduction<<< 1 ,threads >>>(d_data,num_elements);

      // Copy from device memory back to host memory
      cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);

      cudaFree(d_data);
      cudaDeviceReset();
      return result;
    }
    #endif
    float computeOnDevice(float* h_data, int num_elements)
    {
      float* d_data = NULL;
      float result;

      // Memory allocation on device side
      cudaError_t err = cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
      if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); exit(0);}
      // Copy from host memory to device memory
      cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );

      int threads = (num_elements);

      // Invoke the kernel
      reduction<<< 1 ,threads >>>(d_data,num_elements);

      // Copy from device memory back to host memory
      cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
      cudaFree(d_data);
      err = cudaGetLastError();
      if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); exit(0);}
      cudaDeviceReset();
      return result;
    }
    $ nvcc -arch=sm_35 -o t30 t30.cu
    $ cuda-memcheck ./t30
    ========= CUDA-MEMCHECK
    Test CORRECTO: Coinciden los resultados de la CPU y la GPU
    device: 260795.000000  host: 260795.000000
    ========= ERROR SUMMARY: 0 errors
    $
    

    You still have not added proper CUDA error checking to your code, so it is entirely possible that you have a machine setup problem. If you are still having trouble, you may want to run the exact code I have posted above, since it has rudimentary error checking in it.
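
    As a footnote to point 2: since the kernel's modulo-based pairing only works when the threadblock size is a power of two, a more general launcher would round the thread count up before the launch. What follows is a minimal sketch of such a launcher, under the same single-block assumption as above, with the rudimentary error checking folded in. The CUDA_CHECK macro, the next_pow2 helper, and the computeOnDeviceChecked name are illustrative inventions of mine, not part of the CUDA API; the reduction kernel is assumed to be the one defined earlier.

        #include <cstdio>
        #include <cstdlib>
        #include <cuda_runtime.h>

        // Illustrative helper macro: abort with a message if a CUDA call fails.
        #define CUDA_CHECK(call)                                              \
          do {                                                                \
            cudaError_t err_ = (call);                                        \
            if (err_ != cudaSuccess) {                                        \
              fprintf(stderr, "CUDA error: %s at %s:%d\n",                    \
                      cudaGetErrorString(err_), __FILE__, __LINE__);          \
              exit(EXIT_FAILURE);                                             \
            }                                                                 \
          } while (0)

        // Round n up to the next power of two (n >= 1).
        static int next_pow2(int n)
        {
          int p = 1;
          while (p < n) p *= 2;
          return p;
        }

        float computeOnDeviceChecked(float* h_data, int num_elements)
        {
          // The kernel's shared array is fixed at NUM_ELEMENTS floats, so a
          // single block cannot hold more threads than that.
          int threads = next_pow2(num_elements);   // e.g. 300 -> 512
          if (threads > NUM_ELEMENTS) {
            fprintf(stderr, "problem too large for a single block\n");
            exit(EXIT_FAILURE);
          }

          float* d_data = NULL;
          float result;
          CUDA_CHECK(cudaMalloc((void**)&d_data, sizeof(float)*num_elements));
          CUDA_CHECK(cudaMemcpy(d_data, h_data, sizeof(float)*num_elements,
                                cudaMemcpyHostToDevice));

          // Threads with index >= n load 0.0f in the kernel, so padding the
          // block up to a power of two does not change the sum.
          reduction<<< 1, threads >>>(d_data, num_elements);
          CUDA_CHECK(cudaGetLastError());

          CUDA_CHECK(cudaMemcpy(&result, d_data, sizeof(float),
                                cudaMemcpyDeviceToHost));
          CUDA_CHECK(cudaFree(d_data));
          return result;
        }

    For problem sizes larger than one block, the usual pattern is to launch multiple blocks, have each block write its partial sum (as this kernel already does via g_data[blockIdx.x]), and then reduce those partial sums in a second kernel launch or on the host.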