I am very new to CUDA programming. Currently I have difficulties understanding the behavior of the following program, which computes the dot product of two vectors.
The dot-product kernel dotProd computes the product of each pair of elements and reduces the results to a shorter vector of length blockDim.x*gridDim.x. The results in the vector *out are then copied back to the host for further reduction.
The second version, dotProdWithSharedMem, is copied from the book CUDA By Example, see here.
My questions are:
1. When the kernel is launched with at least as many threads as vector elements (i.e. nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the result computed by the CPU, but the result of dotProdWithSharedMem differs from both. What can be the possible causes? A possible output of $ dot_prod.o 17 512:
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.56154 ms
GPU using shared : 9.6906833649, time consummed: 0.04473 ms
CPU result : 9.6904191971, time consummed: 0.28504 ms
2. When the kernel is launched with fewer threads than vector elements (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However, the while loop is supposed to take care of this case. I guess something might happen to the register variable temp in the loop; otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:
Number of threads per block : 256
Number of blocks in the grid: 256
Total number of threads : 65536
Length of vectors : 131072
GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared : 9.6906604767, time consummed: 0.03530 ms
CPU result : 9.6904191971, time consummed: 0.28404 ms
3. I do not quite understand the size of cache in dotProdWithSharedMem. Why does it hold nThreadsPerBlock elements rather than the total number of threads nThreadsPerBlock * nblocks? I thought that should be the right number of temp values, shouldn't it?
Code:
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#define PI (float) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, float *u, float *v, float *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
float temp = 0;
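// Grid-stride loop: each thread accumulates products at indices tid, tid + blockDim.x*gridDim.x, ...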
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
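// One shared-memory slot per thread of this block, holding that thread's partial sum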
__shared__ float cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
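// Tree reduction within the block: the number of active threads is halved each iteration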
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(float);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(float);
size_t size_out_2 = nblocks*sizeof(float);
float *u = (float *)malloc(size);
float *v = (float *)malloc(size);
float *out = (float *)malloc(size_out);
float *out_2 = (float *)malloc(size_out_2);
float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
float res_gpu = 0;
float res_gpu_2 = 0;
float res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
Thank you for your patience in reading this long post! Any help would be much appreciated!
Niko
Answer 0 (score: 3)
You are wandering into the limits of float precision, combined with the variation associated with the order of floating-point operations. The actual "accuracy" here will depend on the exact data and the exact order of operations. A different algorithm will have a different order of operations, and therefore a different result.
You may want to read this paper.
One assumption you seem to be making is that the CPU result is the accurate one, without any basis for that assumption.
If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect the shared-memory result is the more accurate one.
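To see how much the grouping of operations alone can matter, here is a minimal host-only sketch (my own illustration, not taken from your code; the constants are only meant to roughly mirror your u[i]*v[i] terms). It accumulates the same float products once left-to-right and once in 256-element partial sums, similar to what your kernels effectively do, and the two totals will usually disagree in the last few digits:

#include <cstdio>
#include <cmath>

int main() {
    const int n = 131072;      // same length as your vectors
    const int block = 256;     // same grouping as nThreadsPerBlock
    float seq = 0.0f;          // order 1: one long running sum
    float blocked = 0.0f;      // order 2: per-block partial sums, combined afterwards
    for (int b = 0; b < n; b += block) {
        float partial = 0.0f;
        for (int i = b; i < b + block; ++i) {
            // terms roughly mirroring your u[i]*v[i]
            float term = std::sin(i * 3.141592653589793f * 1e-2f) *
                         std::cos(i * 3.141592653589793f * 1e-2f);
            seq     += term;
            partial += term;
        }
        blocked += partial;
    }
    // Mathematically identical sums, but usually not bit-identical in float
    printf("left-to-right : %.10f\n", seq);
    printf("block-wise    : %.10f\n", blocked);
    return 0;
}

Neither ordering is "wrong"; they are simply different roundings of the same exact sum.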
If we convert your code to use the double type instead of float, we observe that:
1. The double result does not match any of the float-case results.
2. The shared-memory result in the float case is actually the closest to the double-case result.
Here is a test case demonstrating this:
$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
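// ft is the floating-point type under test: float by default, double when compiled with -DUSE_DOUBLE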
#define PI (ft) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
__shared__ ft cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(ft);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(ft);
size_t size_out_2 = nblocks*sizeof(ft);
ft *u = (ft *)malloc(size);
ft *v = (ft *)malloc(size);
ft *out = (ft *)malloc(size_out);
ft *out_2 = (ft *)malloc(size_out_2);
ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
ft res_gpu = 0;
ft res_gpu_2 = 0;
ft res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared : 9.6906833649, time consummed: 0.04289 ms
CPU result : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared : 9.6913433287, time consummed: 0.05032 ms
CPU result : 9.6913433287, time consummed: 0.41275 ms
$