我正在尝试在CPU和GPU上添加大型矢量的所有元素,并对结果进行基准测试。
我的CPU实现看起来像这样
void reductionCPU(float *result, float *input)
{
int i;
for (i = 0; i < SIZE; i++)
{
*result += input[i];
}
}
我的GPU内核是这样的:
__global__ void reductionKernel(float *result, float *input)
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;
if (index < SIZE)
{
atomicAdd(result, input[index]);
}
}
(下面的整个最小工作示例)
两者都很简单,但表现得有些奇怪。如果我让CPU和GPU只添加数字,结果总是匹配。输出是:
CPU时间:22.596495 ms,带宽:3.540372 GB / s
--- block_x:32,block_y:32,dim_x:100,dim_y:98
GPU时间:30.625248 ms,带宽:2.612224 GB / s CPU结果匹配 GPU导致天真的原子添加。 CPU:10000000.000000,GPU: 10000000.000000
但是,如果我想添加任意浮点数,结果永远不会匹配。
CPU时间:22.472712 ms,带宽:3.559873 GB / s
--- block_x:32,block_y:32,dim_x:100,dim_y:98
GPU时间:30.625153 ms,带宽:2.612232 GB / s CPU结果没有 匹配GPU结果天真的原子添加。 CPU:4996870656.000000,GPU: 4996921856.000000,差异:-51200.000000
更改添加到50之类的元素的数量会导致在某些运行中正确计算,而在其他运行中会导致错误计算。增加大小会导致错误计算的数量增加。
我认为这与浮点的精度有关,但这只是猜测。
如果我在0到10之间只添加数十个或随机的整个浮点数,则会出现同样的问题:
for (i = 0; i < SIZE; i++)
{
input[i] = floorf(((float)rand() / (float)(RAND_MAX)) * 10);
//input[i] = 10.0;
}
我在Windows 10上使用最新版本的Visual Studio进行开发。此外,我发现代码生成参数也会产生影响。我使用compute_30,sm_30
。用60替换30不能在我的GPU上运行,结果总是0.0。
如果缺少任何信息,请告诉我。
这是完整的最小工作代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
cudaError_t reductionWithCuda(float *result, float *input);
__global__ void reductionKernel(float *result, float *input);
void reductionCPU(float *result, float *input);
#define SIZE 10000000
//#define SIZE 50
#define BLOCK_X_NAIVE 32
#define BLOCK_Y_NAIVE 32
#define BLOCK_COUNT_X 100
int main()
{
int i;
float *input;
float resultCPU, resultGPU;
double cpuTime, cpuBandwidth;
input = (float*)malloc(SIZE * sizeof(float));
resultCPU = 0;
resultGPU = 0;
srand((int)time(NULL));
auto start = std::chrono::high_resolution_clock::now();
auto end = std::chrono::high_resolution_clock::now();
for (i = 0; i < SIZE; i++)
{
input[i] = ((float)rand() / (float)(RAND_MAX)) * 1000; // random floats between 0 and 1000
//input[i] = 1.0;
}
start = std::chrono::high_resolution_clock::now();
reductionCPU(&resultCPU, input);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000);
cpuBandwidth = (sizeof(float) * SIZE * 2) / (cpuTime * 1000000);
printf("CPU Time: %f ms, bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);
reductionWithCuda(&resultGPU, input);
if (resultCPU != resultGPU)
printf("CPU result does not match GPU result in naive atomic add. CPU: %f, GPU: %f, diff:%f\n", resultCPU, resultGPU, (resultCPU - resultGPU));
else
printf("CPU result matches GPU result in naive atomic add. CPU: %f, GPU: %f\n", resultCPU, resultGPU);
cudaDeviceReset();
return 0;
}
void reductionCPU(float *result, float *input)
{
int i;
for (i = 0; i < SIZE; i++)
{
*result += input[i];
}
}
__global__ void reductionKernel(float *result, float *input)
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int index = row * BLOCK_X_NAIVE * BLOCK_COUNT_X + col;
if (index < SIZE)
{
atomicAdd(result, input[index]);
}
}
cudaError_t reductionWithCuda(float *result, float *input)
{
dim3 dim_grid, dim_block;
float *dev_input = 0;
float *dev_result = 0;
cudaError_t cudaStatus;
cudaEvent_t start, stop;
float elapsed = 0;
double gpuBandwidth;
dim_block.x = BLOCK_X_NAIVE;
dim_block.y = BLOCK_Y_NAIVE;
dim_block.z = 1;
dim_grid.x = BLOCK_COUNT_X;
dim_grid.y = (int)ceil((float)SIZE / (float)(BLOCK_X_NAIVE * BLOCK_Y_NAIVE * BLOCK_COUNT_X));
dim_grid.z = 1;
printf("\n---block_x:%d, block_y:%d, dim_x:%d, dim_y:%d\n", dim_block.x, dim_block.y, dim_grid.x, dim_grid.y);
cudaSetDevice(0);
cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
cudaMalloc((void**)&dev_result, sizeof(float));
cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
reductionKernel << <dim_grid, dim_block >> >(dev_result, dev_input);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
gpuBandwidth = (sizeof(float) * SIZE * 2) / (elapsed * 1000000);
printf("GPU Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
cudaDeviceSynchronize();
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_input);
cudaFree(dev_result);
return cudaStatus;
}