I am very new to CUDA programming. Currently I have difficulties understanding the behavior of the following program, which computes the dot product of two vectors.
The dot-product kernel dotProd computes the product of each pair of elements and reduces the results to a shorter vector of length blockDim.x*gridDim.x. The results in the vector *out are then copied back to the host for further reduction.
The second version, dotProdWithSharedMem, is copied from the book CUDA By Example, see here.
My questions are:
1. When the kernel is launched with at least as many threads as vector elements (i.e. nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the result computed by the CPU, but the result of dotProdWithSharedMem differs from both. What can be the possible causes? A possible output of $ dot_prod.o 17 512:
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.56154 ms
GPU using shared : 9.6906833649, time consummed: 0.04473 ms
CPU result : 9.6904191971, time consummed: 0.28504 ms
2. When the kernel is launched with fewer threads than vector elements (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However, the while loop is supposed to take care of this case. I guess something might happen to the register variable temp in the loop; otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:
Number of threads per block : 256
Number of blocks in the grid: 256
Total number of threads : 65536
Length of vectors : 131072
GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared : 9.6906604767, time consummed: 0.03530 ms
CPU result : 9.6904191971, time consummed: 0.28404 ms
3. I do not quite understand the size of cache in dotProdWithSharedMem. Why does it hold nThreadsPerBlock elements rather than the total number of threads nThreadsPerBlock * nblocks? I thought that should be the right number of temp values, shouldn't it?
Code:
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#define PI (float) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, float *u, float *v, float *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
float temp = 0;
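// Grid-stride loop: each thread accumulates products at indices tid, tid + blockDim.x*gridDim.x, ...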
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
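// One shared-memory slot per thread of this block, holding that thread's partial sum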
__shared__ float cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
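// Tree reduction within the block: the number of active threads is halved each iteration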
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(float);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(float);
size_t size_out_2 = nblocks*sizeof(float);
float *u = (float *)malloc(size);
float *v = (float *)malloc(size);
float *out = (float *)malloc(size_out);
float *out_2 = (float *)malloc(size_out_2);
float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
float res_gpu = 0;
float res_gpu_2 = 0;
float res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
Thank you for your patience in reading this long post! Any help would be much appreciated!
Niko
Answer 0 (score: 3)
You are wandering into the limits of float precision, combined with the variation associated with the order of floating-point operations. The actual "accuracy" here will depend on the exact data and the exact order of operations. A different algorithm will have a different order of operations, and therefore a different result.
You may want to read this paper.
One assumption you seem to be making is that the CPU result is the accurate one, without any basis for that assumption.
If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect the shared-memory result is the more accurate one.
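To see how much the grouping of operations alone can matter, here is a minimal host-only sketch (my own illustration, not taken from your code; the constants are only meant to roughly mirror your u[i]*v[i] terms). It accumulates the same float products once left-to-right and once in 256-element partial sums, similar to what your kernels effectively do, and the two totals will usually disagree in the last few digits:

#include <cstdio>
#include <cmath>

int main() {
    const int n = 131072;      // same length as your vectors
    const int block = 256;     // same grouping as nThreadsPerBlock
    float seq = 0.0f;          // order 1: one long running sum
    float blocked = 0.0f;      // order 2: per-block partial sums, combined afterwards
    for (int b = 0; b < n; b += block) {
        float partial = 0.0f;
        for (int i = b; i < b + block; ++i) {
            // terms roughly mirroring your u[i]*v[i]
            float term = std::sin(i * 3.141592653589793f * 1e-2f) *
                         std::cos(i * 3.141592653589793f * 1e-2f);
            seq     += term;
            partial += term;
        }
        blocked += partial;
    }
    // Mathematically identical sums, but usually not bit-identical in float
    printf("left-to-right : %.10f\n", seq);
    printf("block-wise    : %.10f\n", blocked);
    return 0;
}

Neither ordering is "wrong"; they are simply different roundings of the same exact sum.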
If we convert your code to use the double type instead of float, we observe that:
1. The double result does not match any of the float-case results.
2. The shared-memory result in the float case is actually the closest to the double-case result.
Here is a test case demonstrating this:
$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
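// ft is the floating-point type under test: float by default, double when compiled with -DUSE_DOUBLE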
#define PI (ft) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
__shared__ ft cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(ft);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(ft);
size_t size_out_2 = nblocks*sizeof(ft);
ft *u = (ft *)malloc(size);
ft *v = (ft *)malloc(size);
ft *out = (ft *)malloc(size_out);
ft *out_2 = (ft *)malloc(size_out_2);
ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
ft res_gpu = 0;
ft res_gpu_2 = 0;
ft res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared : 9.6906833649, time consummed: 0.04289 ms
CPU result : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared : 9.6913433287, time consummed: 0.05032 ms
CPU result : 9.6913433287, time consummed: 0.41275 ms
$