I have put together a minimal example of a problem I am facing in a larger code base. In this example, I want to find the sum-squared error of some data ys against a set of functions fs, but I want to do it for many functions at once, so I create fs as a matrix. The original data has length gridSize, and I want to evaluate this cost function for nGrids functions at a time, so fs has size nGrids*gridSize.
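Concretely, the cost I am computing for each grid g is the following sum-squared error (this matches the host reference H_get_costs below, which loops ir from 0 to gridSize-2):

\[
\mathrm{cost}_g \;=\; \sum_{i=0}^{\mathrm{gridSize}-2} \bigl( \mathrm{ys}_i - \mathrm{fs}_{\,g \cdot \mathrm{gridSize} + i} \bigr)^2
\]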
I am finding that the CUDA kernel gives unreliable results nondeterministically, which leads me to believe I am not handling my threads correctly (this is my first CUDA kernel!). I ran cuda-memcheck on this program and it reported no errors.
To test the sporadic nature of these errors, I wrote a script that runs the program 100 times and counts how often the results come out randomly wrong. I found that the results are much more likely to be off as gridSize grows:

gridSize ... Errors
300 ... 0/100
400 ... 0/100
450 ... 4/100
500 ... 5/100
550 ... 55/100
600 ... 59/100
650 ... 100/100

The idea here is to have each block work on one grid, and to launch multiple blocks when I want to increase the parallelism; I launch 12 blocks here because there are 12 grids. For this code I will never have gridSize larger than 1000, so I just leave Nthreads at 1024 (since I have 1024 threads per block on my hardware).
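In other words, the mapping is one block per grid and one thread per data point; the launch (it appears in full in D_get_costs below) is simply:

    // ig = blockIdx.x picks the function, ir = threadIdx.x picks the sample
    cuCosts<<<nGrids, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);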
If it matters, my hardware is an NVIDIA GTX 770.

Here is the code:
#include <stdio.h>
#include <stdlib.h>  // for rand() and malloc()
#define nGrids 12
#define gridSize 700
void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs);
void D_get_costs(float* h_xs, float* h_ys, float* h_fs, float* d_costs);
/**************\
* cuda Costs *
\**************/
__global__ void cuCosts(float* d_xs, float* d_ys, float* d_fs, float* d_costs) {
    int ir = threadIdx.x;
    int ig = blockIdx.x;
    __shared__ float diff[1024];
    diff[ir] = 0.0;
    __syncthreads();
    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] = (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
        __syncthreads();
        // reduction
        for(int s=1; s < blockDim.x; s*=2) {
            if( ir%(2*s) == 0 && ir+s < gridSize){
                diff[ir] += diff[ir+s];
            }
        }
        __syncthreads();
        d_costs[ig] = diff[0];
    }
    __syncthreads();
}
/****************\
* Main routine *
\****************/
int main(int argc, char** argv) {
    float h_xs[gridSize];
    float h_ys[gridSize];
    float h_fs[gridSize*nGrids];
    for( int ir = 0; ir < gridSize; ir++) {
        h_xs[ir] = (float)ir/10.0;
        h_ys[ir] = (float)ir/10.0;
    }
    for(int ir = 0; ir < gridSize; ir++) {
        for(int jgrid = 0; jgrid < nGrids; jgrid++) {
            float trand = 2.0*((float)rand()/(float)RAND_MAX) - 1.0;
            h_fs[jgrid*gridSize + ir] = h_ys[ir] + trand;
        }
    }
    float h_costs[nGrids];
    float d_costs[nGrids];
    // get all of the costs (on the host)
    H_get_costs(h_xs, h_ys, h_fs, h_costs);
    // get all of the costs (on the device)
    D_get_costs(h_xs, h_ys, h_fs, d_costs);
    // Print the grids
    /*
    for(int ir = 0; ir < gridSize; ir++) {
        printf("%10.5e %15.5e", h_xs[ir], h_ys[ir]);
        for(int jg = 0; jg < nGrids; jg++) {
            printf("%15.5e", h_fs[jg*gridSize + ir]);
        }
        printf("\n");
    }
    */
    // print the results
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s ", "Host ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", h_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s ", "Device ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]);
    }
    printf("\n");
    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
    printf("%-25s ", "Difference ... ");
    for(int ig = 0; ig < nGrids; ig++) {
        printf("%15.5e", d_costs[ig]-h_costs[ig]);
    }
    printf("\n");
    return 0;
}
/*******************************\
* get the costs (on the host) *
\*******************************/
void H_get_costs(float* h_xs, float* h_ys, float* h_fs, float* h_costs) {
    for(int ig = 0; ig < nGrids; ig++) { h_costs[ig] = 0.0; }
    for(int ir = 0; ir < gridSize-1; ir++) {
        for(int ig = 0; ig < nGrids; ig++) {
            h_costs[ig] += (h_ys[ir] - h_fs[ig*gridSize + ir])*(h_ys[ir] - h_fs[ig*gridSize + ir]);
        }
    }
}
/**************************\
* wrapper for cuda costs *
\**************************/
void D_get_costs(float* h_xs_p, float* h_ys_p, float* h_fs_p, float* r_costs) {
    float* d_xs;
    float* d_ys;
    float* d_fs;
    float* d_costs; // device costs
    float* t_costs; // temporary costs
    cudaMalloc( (void**)&d_xs, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_ys, gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_fs, nGrids*gridSize*sizeof(float) );
    cudaMalloc( (void**)&d_costs, nGrids*sizeof(float) );
    t_costs = (float*)malloc(nGrids*sizeof(float));
    cudaMemcpy( d_xs, h_xs_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_ys, h_ys_p, gridSize*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_fs, h_fs_p, nGrids*gridSize*sizeof(float), cudaMemcpyHostToDevice);
    int Nthreads = 1024;
    int Nblocks = nGrids;
    cuCosts<<<Nblocks, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);
    cudaMemcpy( t_costs, d_costs, nGrids*sizeof(float), cudaMemcpyDeviceToHost);
    for(int ig = 0; ig < nGrids; ig++) {
        r_costs[ig] = t_costs[ig];
    }
    // release all the buffers allocated above
    cudaFree( d_xs );
    cudaFree( d_ys );
    cudaFree( d_fs );
    cudaFree( d_costs );
    free( t_costs );
}
Answer 0 (score: 1)
Your kernel code has a number of synchronization problems that are causing the trouble. For one, there is branching around the __syncthreads() calls, which is undefined behavior in CUDA. And you are missing a synchronization point inside the reduction loop, which means the warp-to-warp accumulation is incorrect. Something like this:
__global__ void cuCosts(float* d_xs, float* d_ys,
                        float* d_fs, float* d_costs)
{
    int ir = threadIdx.x;
    int ig = blockIdx.x;
    __shared__ float diff[1024];
    diff[ir] = 0.0;
    __syncthreads();
    if( ir < gridSize-1 && ig < nGrids) {
        diff[ir] = (d_ys[ir] - d_fs[ig*gridSize + ir])*(d_ys[ir] - d_fs[ig*gridSize + ir]);
    }
    __syncthreads();  // every thread reaches this barrier; no branching around it
    // reduction
    for(int s=1; s < blockDim.x; s*=2) {
        if( ir%(2*s) == 0 && ir+s < gridSize){
            diff[ir] += diff[ir+s];
        }
        __syncthreads();  // synchronize after every step of the reduction
    }
    d_costs[ig] = diff[0];
}
should work correctly [disclaimer: written in a browser, untested, use at your own risk]
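As a further sketch in the same untested spirit: nothing in D_get_costs checks whether the kernel actually launched and completed, and sporadic failures are much easier to chase down with explicit error checks after the launch. Something like this (cudaGetLastError catches launch errors, cudaDeviceSynchronize surfaces errors from the kernel run itself):

    cuCosts<<<Nblocks, Nthreads>>>(d_xs, d_ys, d_fs, d_costs);
    // check that the launch itself succeeded
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    // wait for the kernel to finish and surface any runtime error
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("kernel execution failed: %s\n", cudaGetErrorString(err));
    }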