CUDA atomicAdd with floats

Posted: 2017-01-10 15:46:16

Tags: c++ cuda floating-point precision atomic

I am trying to sum all elements of a large vector on both the CPU and the GPU and to benchmark the results.

My CPU implementation looks like this:

void reductionCPU(float *result, float *input)
{
    int i;

    for (i = 0; i < SIZE; i++)
    {
        *result += input[i];
    }
}
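For reference, a compensated (Kahan) version of the same loop should give a more trustworthy CPU baseline. This is only a sketch I have not benchmarked, and the function name is mine:

void reductionCPUKahan(float *result, float *input)
{
    float sum = 0.0f;
    float c = 0.0f; // running compensation for lost low-order bits

    for (int i = 0; i < SIZE; i++)
    {
        float y = input[i] - c;  // re-inject the error from the previous step
        float t = sum + y;
        c = (t - sum) - y;       // recover what the addition just rounded away
        sum = t;
    }

    *result = sum;
}

(Note that aggressive floating-point optimization such as MSVC's /fp:fast can optimize the compensation away.)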

My GPU kernel looks like this:

__global__ void reductionKernel(float *result, float *input)
{
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;

    if (index < SIZE)
    {
        atomicAdd(result, input[index]);
    }
}
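If precision turns out to be the problem, one experiment would be to accumulate into a double on the device. sm_30 has no native double-precision atomicAdd, but the CUDA C Programming Guide shows how to emulate it with atomicCAS. A sketch of a kernel variant using that (the names are mine, and result would have to be allocated as a device double):

__device__ double atomicAddDouble(double *address, double val)
{
    // Emulated double atomicAdd via atomicCAS, following the CUDA C
    // Programming Guide; needed because sm_30 lacks a native version.
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;

    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}

__global__ void reductionKernelDouble(double *result, float *input)
{
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;

    if (index < SIZE)
    {
        atomicAddDouble(result, (double)input[index]);
    }
}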

(The full minimal working example is at the bottom.)

Both the CPU version and the kernel are simple enough, yet they behave strangely. If I let the CPU and the GPU add nothing but ones, the results always match. The output is:

CPU Time: 22.596495 ms, bandwidth: 3.540372 GB/s

---block_x:32, block_y:32, dim_x:100, dim_y:98

GPU Time: 30.625248 ms, bandwidth: 2.612224 GB/s
CPU result matches GPU result in naive atomic add. CPU: 10000000.000000, GPU: 10000000.000000
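If I understand float representation correctly, this exact match is expected: summing ones never takes the running total past 2^24 = 16777216, and every integer below that is exactly representable in a float, so the order of the additions cannot change anything. A minimal host-side check of that assumption (the variable names are mine):

#include <cstdio>

int main()
{
    // Integers are exactly representable in a float up to 2^24.
    float at_limit = 16777216.0f;        // 2^24, exact
    float past_limit = at_limit + 1.0f;  // 2^24 + 1 is not representable; rounds back down

    printf("%.1f vs %.1f\n", at_limit, past_limit); // prints 16777216.0 twice
    return 0;
}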

However, if I sum arbitrary floats, the results never match:

CPU Time: 22.472712 ms, bandwidth: 3.559873 GB/s

---block_x:32, block_y:32, dim_x:100, dim_y:98

GPU Time: 30.625153 ms, bandwidth: 2.612232 GB/s
CPU result does not match GPU result in naive atomic add. CPU: 4996870656.000000, GPU: 4996921856.000000, diff:-51200.000000

Changing the number of elements to something like 50 produces a correct result on some runs and a wrong one on others. Increasing the size increases the number of runs that miscalculate.

I suspect this has to do with the precision of floating-point arithmetic, but that is only a guess.
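One way to test that guess without any CUDA at all: float addition is not associative, so summing the same values in two different orders on the CPU should already produce different totals. A small sketch of that experiment (all names and the seed are mine):

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::mt19937 gen(42);
    std::uniform_real_distribution<float> dist(0.0f, 1000.0f);

    std::vector<float> v(10000000);
    for (auto &x : v) x = dist(gen);

    // Sum in the original order.
    float forward = 0.0f;
    for (float x : v) forward += x;

    // Sum the same values in a shuffled order.
    std::shuffle(v.begin(), v.end(), gen);
    float shuffled = 0.0f;
    for (float x : v) shuffled += x;

    // The totals typically differ, because every addition rounds and the
    // rounding depends on the order, which is exactly what concurrent
    // atomicAdds change from run to run.
    printf("forward: %f, shuffled: %f, diff: %f\n",
           forward, shuffled, forward - shuffled);
    return 0;
}

If the two CPU sums disagree with each other by a similar magnitude, the CPU-vs-GPU difference would just be ordering, not a GPU bug.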

The same problem occurs if I sum only tens, or random whole-number floats between 0 and 10:

for (i = 0; i < SIZE; i++)
{
    input[i] = floorf(((float)rand() / (float)(RAND_MAX)) * 10);
    //input[i] = 10.0;
}

I develop on Windows 10 with the latest version of Visual Studio. I also found that the code-generation parameters matter: I compile with compute_30,sm_30. Replacing the 30s with 60 does not run on my GPU at all; the result is then always 0.0.
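On the compute_60 point: I suspect that with a 60 build the kernel simply never launches on my card, and I read back the untouched 0.0 without noticing, because I never check the launch status. A minimal check I could add after the launch (the error-handling style is my own):

reductionKernel<<<dim_grid, dim_block>>>(dev_result, dev_input);

// A launch failure (for example, a binary built for the wrong
// architecture) surfaces here instead of silently leaving
// *dev_result at 0.0.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("kernel launch failed: %s\n", cudaGetErrorString(err));

err = cudaDeviceSynchronize(); // catches errors during execution
if (err != cudaSuccess)
    printf("kernel execution failed: %s\n", cudaGetErrorString(err));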

Please let me know if any information is missing.

Here is the complete minimal working example:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

cudaError_t reductionWithCuda(float *result, float *input);
__global__ void reductionKernel(float *result, float *input);
void reductionCPU(float *result, float *input);

#define SIZE 10000000
//#define SIZE 50

#define BLOCK_X_NAIVE 32
#define BLOCK_Y_NAIVE 32
#define BLOCK_COUNT_X 100

int main()
{
    int i;
    float *input;
    float resultCPU, resultGPU;
    double cpuTime, cpuBandwidth;

    input = (float*)malloc(SIZE * sizeof(float));
    resultCPU = 0;
    resultGPU = 0;

    srand((int)time(NULL));

    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    for (i = 0; i < SIZE; i++)
    {
        input[i] = ((float)rand() / (float)(RAND_MAX)) * 1000; // random floats between 0 and 1000
        //input[i] = 1.0;
    }

    start = std::chrono::high_resolution_clock::now();
    reductionCPU(&resultCPU, input);
    end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000);
    cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
    printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    reductionWithCuda(&resultGPU, input);

    if (resultCPU != resultGPU)
        printf("CPU result does not match GPU result in naive atomic add. CPU: %f, GPU: %f, diff:%f\n", resultCPU, resultGPU, (resultCPU - resultGPU));
    else
        printf("CPU result matches GPU result in naive atomic add. CPU: %f, GPU: %f\n", resultCPU, resultGPU);

    cudaDeviceReset();

    return 0;
}

void reductionCPU(float *result, float *input)
{
    int i;

    for (i = 0; i < SIZE; i++)
    {
        *result += input[i];
    }
}

__global__ void reductionKernel(float *result, float *input)
{
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;

    if (index < SIZE)
    {
        atomicAdd(result, input[index]);
    }
}

cudaError_t reductionWithCuda(float *result, float *input)
{
    dim3 dim_grid, dim_block;

    float *dev_input = 0;
    float *dev_result = 0;
    cudaError_t cudaStatus;
    cudaEvent_t start, stop;
    float elapsed = 0;
    double gpuBandwidth;

    dim_block.x = BLOCK_X_NAIVE;
    dim_block.y = BLOCK_Y_NAIVE;
    dim_block.z = 1;

    dim_grid.x = BLOCK_COUNT_X;
    dim_grid.y = (int)ceil((float)SIZE / (float)(BLOCK_X_NAIVE * BLOCK_Y_NAIVE * BLOCK_COUNT_X));
    dim_grid.z = 1;

    printf("\n---block_x:%d, block_y:%d, dim_x:%d, dim_y:%d\n", dim_block.x, dim_block.y, dim_grid.x, dim_grid.y);

    cudaSetDevice(0);
    cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
    cudaMalloc((void**)&dev_result, sizeof(float));
    cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reductionKernel<<<dim_grid, dim_block>>>(dev_result, dev_input);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsed, start, stop);

    gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
    printf("GPU Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

    cudaDeviceSynchronize();
    cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_input);
    cudaFree(dev_result);

    return cudaStatus;
}

0 Answers:

There are no answers yet.