如何在CUDA中通过归约(reduction)求数组的总和

时间:2016-07-29 09:04:08

标签: cuda reduction

我实现了一个函数,通过使用reduction来查找数组的总和,我的数组有32 * 32个元素,其值为0 ... 1023。 我的预期总和值是523776,但我的结果是15872,这是错误的。 这是我的代码:

#include <stdio.h>
#include <cuda.h>

// Problem dimensions: the input is treated as w x h = 32 x 32 ints.
#define w 32
#define h 32
// Total element count. Expands textually to w*h with no surrounding
// parentheses, so it regroups when used inside a larger expression.
#define N w*h

__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);

// Host driver: fills a[] with 0..N-1, copies it to the device, launches the
// reduce kernel, then prints the first int read back from dev_b.
// None of the CUDA runtime return codes in this function are checked.
int main( void ) {
    int a[N], b[N]; // host buffers: a is the input; b is never written before it is copied to the device
    int *dev_a, *dev_b; // device buffers: dev_a holds the input, dev_b[0] is the kernel's atomicAdd target
    int size = N * sizeof( int ); // byte size of N (= 32*32 = 1024) ints

    // allocate the two device buffers, N ints each
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );

    fill_array( a, N );

    // copy inputs to device
    // NOTE(review): b still holds indeterminate values at this point, so
    // dev_b[0] (the accumulator the kernel adds into) starts from an unknown value.
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );

    // 16x16 threads per block; with w = h = 32 the ceil-divisions below give a 2x2 grid.
    // NOTE(review): the reduce kernel below indexes with .x only, so this 2D
    // launch does not match the kernel's 1D indexing.
    dim3 blocksize(16,16);
    dim3 gridsize;

    gridsize.x=(w+blocksize.x-1)/blocksize.x;
    gridsize.y=(h+blocksize.y-1)/blocksize.y;

    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);

    // copy only the first int of dev_b (the accumulated sum) back into b[0]
    cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );

    printf("Reduced sum of Array elements = %d \n", b[0]);

    cudaFree( dev_a );
    cudaFree( dev_b );

    return 0;
}

// Per-block sum reduction in shared memory.
// Each thread stages one int of g_idata into sdata, the block combines the
// staged values pairwise, and the thread with threadIdx.x == 0 atomically adds
// sdata[0] to g_odata[0].
// Only the .x components of blockIdx/blockDim/threadIdx are used, and the
// staging buffer has room for 256 ints.
__global__ void reduce(int *g_idata, int *g_odata) {

    __shared__ int sdata[256];

    // each thread loads one element from global to shared mem
    // (i is a 1D global index built from the .x components only)
    int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[threadIdx.x] = g_idata[i];

    // all loads must be visible before any thread starts combining
    __syncthreads();
    // do reduction in shared mem: s is the distance between the two slots
    // being added and doubles every pass (1, 2, 4, ...)
    for (int s=1; s < blockDim.x; s *=2)
    {
        // thread t accumulates into slot 2*s*t, adding the slot s positions after it
        int index = 2 * s * threadIdx.x;;

        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        // barrier sits outside the branch so every thread in the block reaches it
        __syncthreads();
    }

    // write result for this block to global mem
    // (atomicAdd accumulates into whatever value g_odata[0] already holds)
    if (threadIdx.x == 0)
        atomicAdd(g_odata,sdata[0]);
}

// CPU function that fills a[0..n-1] with the sequence 0, 1, ..., n-1
// (the values are deterministic, not random).
void fill_array (int *a, int n)
{
    for (int i = 0; i < n; i++)
        a[i] = i;
}

1 个答案:

答案 0 :(得分:3)

您的代码中至少存在2个问题

  1. 您正在对dev_b数组中的第一个元素执行atomicAdd,但您没有将该元素初始化为已知值(即0)。诚然,在运行内核之前,您把b复制到了dev_b,但由于您没有将b初始化为任何已知值,这样做并没有帮助。如果您以为数组b会被自动清零,那是不对的:在C或C++中,它不会自动初始化为零。我们可以在将b复制到dev_b之前,先把b[0]设置为零来解决此问题。

  2. 这个归约内核是按一维情况编写的(即它只使用基于.x的一维线程索引),但您却用二维的线程块和网格来启动内核。这种不匹配无法正常工作:我们要么改为启动一维的线程块和网格,要么重写内核以使用二维索引(即.x和.y)。我选择了前者(一维)。

  3. 下面是一个可运行的示例,它对您的代码做了上述修改,并且能够产生正确的结果:

    $ cat t1218.cu
    #include <stdio.h>
    
    // Problem dimensions: the input holds w * h = 32 * 32 = 1024 ints.
    #define w 32
    #define h 32
    // Parenthesised so N behaves as a single value inside larger expressions;
    // the bare form w*h would turn x / N into (x / w) * h.
    #define N (w*h)
    
    __global__ void reduce(int *g_idata, int *g_odata);
    void fill_array (int *a, int n);
    
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when status is cudaSuccess, false otherwise.
    static bool cuda_ok(cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
            return true;
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    // Host driver: fills a[] with 0..N-1, sums it on the GPU with the reduce
    // kernel (1D grid of 1D blocks), and prints the result next to the
    // closed-form expected value.
    // Returns 0 on success and 1 if a precondition or any CUDA call fails.
    int main( void ) {
        const int n = N;                  // element count, captured once so later arithmetic never re-expands the macro
        const int threadsPerBlock = 256;  // must not exceed the kernel's 256-entry shared buffer
        int a[N];                         // host input
        int sum = 0;                      // host copy of the reduced sum
        int *dev_a = NULL;                // device input
        int *dev_sum = NULL;              // device accumulator (a single int)
        const size_t bytes = n * sizeof( int );

        // reduce has no bounds check, so every launched thread must map to a
        // valid element: n has to be an exact multiple of the block size.
        if (n % threadsPerBlock != 0) {
            fprintf(stderr, "N (%d) must be a multiple of the block size (%d)\n",
                    n, threadsPerBlock);
            return 1;
        }

        fill_array( a, n );

        // Allocate, upload the input, and zero the accumulator on the device.
        // The kernel atomicAdd()s into *dev_sum, so it must start at a known 0.
        bool ok = cuda_ok(cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc(dev_a)")
               && cuda_ok(cudaMalloc( (void**)&dev_sum, sizeof( int ) ), "cudaMalloc(dev_sum)")
               && cuda_ok(cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)")
               && cuda_ok(cudaMemset( dev_sum, 0, sizeof( int ) ), "cudaMemset(dev_sum)");

        if (ok) {
            dim3 blocksize(threadsPerBlock);     // create 1D threadblock
            dim3 gridsize(n / threadsPerBlock);  // create 1D grid

            reduce<<<gridsize, blocksize>>>(dev_a, dev_sum);

            // cudaGetLastError reports launch-configuration errors; the blocking
            // copy that follows also surfaces any fault raised while the kernel ran.
            ok = cuda_ok(cudaGetLastError(), "reduce launch")
              && cuda_ok(cudaMemcpy( &sum, dev_sum, sizeof( int ), cudaMemcpyDeviceToHost ),
                         "cudaMemcpy(sum)");
        }

        if (ok) {
            printf("Reduced sum of Array elements = %d \n", sum);
            // 0 + 1 + ... + (n-1) = (n-1) * n / 2
            printf("Value should be: %d \n", (n - 1) * (n / 2));
        }

        // cudaFree accepts a null pointer, so this is safe on every path above
        cudaFree( dev_a );
        cudaFree( dev_sum );

        return ok ? 0 : 1;
    }
    
    // Block-level sum reduction.
    // Each block stages blockDim.x consecutive ints of g_idata in shared
    // memory, combines them pairwise, and atomically adds its partial sum to
    // g_odata[0].
    //
    // Launch contract:
    //   - 1D grid of 1D blocks (only the .x indices are used);
    //   - blockDim.x <= 256, the capacity of the shared staging buffer
    //     (a larger block traps instead of overflowing it);
    //   - g_idata holds at least gridDim.x * blockDim.x ints: there is no
    //     bounds check because the signature carries no element count;
    //   - g_odata[0] must be set (normally to 0) before the launch, since the
    //     kernel only accumulates into it.
    __global__ void reduce(int *g_idata, int *g_odata) {

        const unsigned int kMaxThreads = 256;
        __shared__ int sdata[kMaxThreads];

        const unsigned int tid = threadIdx.x;

        // An oversized block would write past the end of sdata. The condition is
        // uniform across the block; __trap() aborts the kernel so the host sees
        // an error instead of silently corrupted results.
        if (blockDim.x > kMaxThreads)
            __trap();

        // each thread loads one element from global to shared mem
        // note use of 1D thread indices (only) in this kernel
        sdata[tid] = g_idata[blockIdx.x * blockDim.x + tid];

        // all loads must be visible before any thread starts combining
        __syncthreads();

        // do reduction in shared mem: s is the distance between the two slots
        // being added and doubles every pass (1, 2, 4, ...)
        for (unsigned int s = 1; s < blockDim.x; s *= 2)
        {
            // thread t accumulates into slot 2*s*t
            const unsigned int index = 2 * s * tid;

            // Requiring the partner slot (index + s) to be inside the block keeps
            // the sum correct when blockDim.x is not a power of two; for
            // power-of-two sizes this is equivalent to index < blockDim.x.
            if (index + s < blockDim.x)
            {
                sdata[index] += sdata[index + s];
            }
            // barrier sits outside the branch so every thread in the block reaches it
            __syncthreads();
        }

        // write result for this block to global mem
        if (tid == 0)
            atomicAdd(g_odata, sdata[0]);
    }
    
    // Host helper: writes the ramp 0, 1, ..., n-1 into the first n slots of a.
    // The values are deterministic; a non-positive n leaves the buffer untouched.
    void fill_array (int *a, int n)
    {
        int *cursor = a;
        int next = 0;

        while (next < n)
        {
            *cursor = next;
            ++cursor;
            ++next;
        }
    }
    $ nvcc -o t1218 t1218.cu
    $ cuda-memcheck ./t1218
    ========= CUDA-MEMCHECK
    Reduced sum of Array elements = 523776
    Value should be: 523776
    ========= ERROR SUMMARY: 0 errors
    $
    

    注意:

    1. 内核以及您编写的主机代码都依赖于N恰好是线程块大小(256)的整数倍。本例满足这一条件,但如果不满足,程序就会出错。

    2. 我没有看到任何proper cuda error checking(正确的CUDA错误检查)的迹象。在这个例子里它不会查出任何问题,但这是良好的做法。作为快速测试,可以像我在这里做的一样,用cuda-memcheck运行您的代码。