I would really appreciate some help with this code. I've been trying to write my own reduction kernels; I know library versions already exist, so this is just for my own benefit. I got a max reduction working, but when I modified it to compute a sum I started getting inconsistent results. cuda-memcheck gave me a 10-megabyte file full of errors and warnings, so I assume I have a race condition that I just can't see. Here is the code:
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip> //For setprecision
#include <math.h>
#include <ctime> //For timers
using namespace std;
#define NUMBLOCKSPERGRID 1 //Universal number of blocks per grid to use.
#define NUMTHREADSPERBLOCK 256 //Universal number of threads per block.
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID //Use multiple of 256
#define NUMPOINTS 256 //Number of timesteps to integrate.
double concStorage[NUMPOINTS][MAXLENGTH] = {}; //Stores concs [rows] vs. time [columns]
__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK; //Number of threads per block
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID; //Number of blocks per grid
__device__ __constant__ int maxlength = MAXLENGTH; //The largest polymer species we track.
//Temporary device arrays to use during integration. Will not send back data.
__device__ double temp4[MAXLENGTH];
__global__ void rkf5(size_t, double*, double*, double*, double*);
__device__ void arrSum2(double*, double*);
__global__ void arrSumKernel(double*, double*, int);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    //Error checking
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx] = b;
}
__global__ void rkf5(size_t pitch, double* concStorage, double* concs, double* dt, double* d_ts)
{
    arrInit<<< numBlocks, numThreads >>>(temp4, 1); //works
    cudaDeviceSynchronize();
    double p = 0;
    arrSum2(temp4, &p);
    cudaDeviceSynchronize();
    arrInit<<< numBlocks, numThreads >>>(d_ts, p);
    cudaDeviceSynchronize();
}
__device__ void arrSum2(double* arr, double* sumVal)
{
    /*
    Description : Sums all elements of array.
    Status : Untested.
    */
    int maxThreads = 1024; //This can be reduced, but this is a max. Make reducible for optimization.
    int blocks = int(maxlength/maxThreads)+1; //works
    double* kernelSums = new double[blocks];
    double* blockSums = new double[1];
    arrInit<<< 1, blocks >>>(kernelSums, 0); //works
    arrInit<<< 1, 1 >>>(blockSums, 0); //works
    cudaDeviceSynchronize();
    arrSumKernel<<< blocks, maxThreads, maxThreads*sizeof(double) >>>(arr, kernelSums, maxlength);
    cudaDeviceSynchronize();
    arrSumKernel<<< 1, blocks, blocks*sizeof(double) >>>(kernelSums, blockSums, blocks);
    cudaDeviceSynchronize();
    *sumVal = blockSums[0];
}
__global__ void arrSumKernel(double* arr, double* sums, int length)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    extern __shared__ double blockMemory[];
    //Move the global array element to shared memory if its index < length, else pad with 0.
    if (idx < length)
        blockMemory[threadIdx.x] = arr[idx];
    else
        blockMemory[threadIdx.x] = 0;
    __syncthreads();
    int stage = 0;
    int maxStage = static_cast<int>(logf(blockDim.x)/logf(2)); //logf needed for CUDA
    while (stage <= maxStage)
    {
        int left = threadIdx.x;
        int right = (threadIdx.x) + powf(2, (stage)); //idx+1 if idx is even, 0 if odd
        if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
            blockMemory[left] += blockMemory[right]; //Move larger value left.
        stage++;
        __syncthreads();
    }
    sums[blockIdx.x] = blockMemory[0];
    __syncthreads();
}
int main(int argc, char** argv)
{
    //Main program.
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    cudaSetDevice(0);
    std::cout << std::fixed; //Fixed-point notation.
    std::cout << std::setprecision(16); //Display 16 decimal places.
    const int numpoints = NUMPOINTS; //Number of timesteps to take.
    const int maxlength = MAXLENGTH; //Number of discrete concentrations we are tracking.
    double mo = 5E-6; //Initial concentration in M.
    double to = 0; //Beginning integration time in seconds.
    double tf = 7200; //Final integration time in seconds.
    double dt = (tf-to)/static_cast<double>(numpoints); //Static step size in seconds.
    double concs[maxlength] = {}; //Stores the initial concentrations.
    double ts[numpoints] = {}; //Holds all the times at which concentrations are stored.
    //Initialize all the arrays on the host to ensure arrays of 0's are sent to the device.
    //Also, here is where we can seed the system.
    std::cout << dt;
    std::cout << "\n";
    concs[0] = mo;
    std::cout << concs[0];
    std::cout << " ";
    concs[0] = mo;
    std::cout << "\n";
    //Define all the pointers to device array memory addresses. These contain the on-GPU
    //addresses of all the data we're generating/using.
    double *d_concStorage; //On-GPU 2D array that stores all the concentrations and associated times.
    double *d_concs; //The concentrations for a specific timestep.
    double *d_dt; //The length of the timestep.
    double *d_ts;
    //Calculate the sizes of the arrays in order to allocate the proper amount of memory on the GPU.
    size_t size_concs = sizeof(concs);
    size_t size_dt = sizeof(dt);
    size_t size_ts = sizeof(ts);
    size_t h_pitch = maxlength*sizeof(double);
    size_t d_pitch;
    //Calculate the "pitch" of the 2D array. The pitch is essentially the allocated length of a row;
    //it's larger than the row of actual data due to hardware alignment, so we use the pitch instead
    //of the data size to traverse the array.
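    //Illustration (mine, not in the original post): with a pitched allocation, element (i, j)
    //is reached by a byte offset of i*d_pitch from the base pointer, e.g.:
    //    double* row = (double*)((char*)d_concStorage + i * d_pitch);
    //    double element = row[j];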
    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints));
    //Allocate memory on the GPU for all the arrays we're going to use in the integrator.
    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
    gpuErrchk(cudaMalloc((void**)&d_ts, size_ts));
    //Copy all initial values of arrays to the GPU.
    gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ts, &ts, size_ts, cudaMemcpyHostToDevice));
    //Run the integrator.
    std::clock_t start;
    double duration;
    start = std::clock();
    rkf5<<<1,1>>>(d_pitch, d_concStorage, d_concs, d_dt, d_ts);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    duration = (std::clock() - start) / (double) CLOCKS_PER_SEC;
    std::cout << "printf: " << duration << '\n';
    //Copy 2D array of concentrations vs. time from GPU to host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&ts, d_ts, size_ts, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();
    for (int i = 0; i < maxlength; i++)
    {
        std::cout << " ";
        std::cout << ts[0];
    }
    cudaDeviceReset(); //Clean up all memory.
    return 0;
}
This produces hazards like these:
========= CUDA-MEMCHECK
========= ERROR: Potential RAW hazard detected at __shared__ 0x1007 in block (0, 0, 0) :
========= Write Thread (512, 0, 0) at 0x000030c8 in C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/arrayFunctions.cu:209:arrSumKernel(double*, double*, int)
========= Read Thread (0, 0, 0) at 0x00003038 in C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/arrayFunctions.cu:209:arrSumKernel(double*, double*, int)
========= Current Value : 0
=========
========= ERROR: Potential RAW hazard detected at __shared__ 0x1006 in block (0, 0, 0) :
========= Write Thread (512, 0, 0) at 0x000030c8 in C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/arrayFunctions.cu:209:arrSumKernel(double*, double*, int)
========= Read Thread (0, 0, 0) at 0x00003038 in C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/arrayFunctions.cu:209:arrSumKernel(double*, double*, int)
========= Current Value : 0
The weird thing is that if I run it through cuda-memcheck --tool racecheck, the result is "256", which is correct given these parameters. If I just run the application, the results vary.
Answer (score: 3):
"How am I creating a RAW hazard here?"
Let's take a look at this loop:
int stage = 0;
int maxStage = static_cast<int>(logf(blockDim.x)/logf(2)); //logf needed for CUDA
while (stage <= maxStage)
{
    int left = threadIdx.x;
    int right = (threadIdx.x) + powf(2, (stage)); //idx+1 if idx is even, 0 if odd
    if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
        blockMemory[left] += blockMemory[right]; //Move larger value left.
    stage++;
    __syncthreads();
}
sums[blockIdx.x] = blockMemory[0];
On the first pass through the while loop, stage = 0, so left = threadIdx.x and right = threadIdx.x + 1. Every thread but the last passes the first part of the if test (right < blockDim.x), and every thread passes the second part (left % int(powf(2, stage)) == 0), because any integer % 1 == 0.
Therefore this line of code:
    blockMemory[left] += blockMemory[right];
gets executed by every thread in the block except the last one. But that cannot be correct, because the result now depends on the order in which the warps execute: if warp 1 runs before warp 0, the result for warp 0, thread 31 will differ from the result when warp 0 runs before warp 1. That constitutes a race condition, i.e. a RAW hazard.
If I modify the conditional test as follows:
    if (( right < blockDim.x ) && ( left % int(powf(2, stage+1)) == 0 ))
then I get consistent, repeatable results (256).
I'm not saying the code is defect-free at that point, but this should point you in the direction of a fix. It's also not how I would do a parallel reduction, and the transcendentals (powf, logf) should certainly be avoided for performance; they aren't needed for powers of 2.
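For reference, here is a minimal sketch of a more conventional shared-memory tree reduction (the "sequential addressing" pattern), assuming blockDim.x is a power of two; the kernel name blockSumKernel is illustrative, not from the original post:
    __global__ void blockSumKernel(const double* arr, double* sums, int length)
    {
        extern __shared__ double blockMemory[];
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        //Load one element per thread, padding with 0 past the end of the array.
        blockMemory[threadIdx.x] = (idx < length) ? arr[idx] : 0.0;
        __syncthreads();
        //Halve the number of active threads each stage. The active threads form a
        //contiguous prefix and each shared element is written by at most one thread
        //per stage, so the __syncthreads() between stages is the only ordering needed.
        for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
            if (threadIdx.x < stride)
                blockMemory[threadIdx.x] += blockMemory[threadIdx.x + stride];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            sums[blockIdx.x] = blockMemory[0];
    }
It launches exactly like arrSumKernel (blockDim.x * sizeof(double) bytes of dynamic shared memory), and the shift replaces both powf and logf.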