Question

当我在我的ODE积分器上运行cuda-memcheck某个参数（时间步数，“NUMPOINTS”）小于某个值（10240）时，它运行正常。当我增加该值时，它会以“GPUassert：未指定的启动失败”终止。此外，当我在其上运行cuda-memcheck时，它似乎根本没有终止。我正在运行MSVS2012，Windows 7，GTX Titan，332.21驱动程序。

搜索周围，我发现http://www.linkedin.com/groups/Unspecified-launch-failure-How-can-1618517.S.69436913，这表明它是由以下两者引起的：

1。）Segfault

2。）内核“太长”

3.）尝试使用过多的共享内存

我不应该接近我的泰坦的全球内存上限，对吧？存储在GPU上的2D数组，我在其中存储我在积分器中计算的所有数据，大小为11264 x 8960，其中每个元素都是double。这将是100,925,440双打，总计807,403,520字节，或788MBytes，Titan有6GB。

如果我在无头GPU上进行集成并且已经运行了长度为〜1000s的内核，那么我的内核长度是否重要？

此参数与我对共享内存的使用无关。

编辑

这不是最小的，但在这里。

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip>                      
#include <math.h>
using namespace std;

#define NUMBLOCKSPERGRID 35                             
#define NUMTHREADSPERBLOCK 256                          
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID   
#define NUMPOINTS 1024*11                //TROUBLEMAKER
double concStorage[NUMPOINTS][MAXLENGTH] = {};  

__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK;    
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID;       
__device__ __constant__ int numpoints = NUMPOINTS;              
__device__ __constant__ int maxlength = MAXLENGTH;              
__device__ __constant__ double localError = 1E-12;              
__device__ __constant__ int nc = 2;                             
__device__ __constant__ int n2 = 0;                             
__device__ __constant__ double ka = 5E4;
__device__ __constant__ double kb = 0;
__device__ __constant__ double kp = 0;
__device__ __constant__ double km = 2E-8;
__device__ __constant__ double kn = 2E-5;
__device__ __constant__ double kn2 = 0;

__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrDiv(double*, double*, double*);
__global__ void arrAbs(double*);
__global__ void arrInit(double*, double);
__global__ void arrInitToLengths(double*);
__global__ void arrCopy(double*, double*);
__global__ void arrMaxKernel(double*, double*, double*, int);
__device__ double arrSum(double*);
__device__ void arrMultAddStore(double , double*, double*, double*, double*);
__device__ int arrLength(double*);
__device__ void arrMax(double*, double*, int*);

__device__ __constant__ double a21 = static_cast<double>(.25);
__device__ __constant__ double a31 = static_cast<double>(3)/static_cast<double>(32);
__device__ __constant__ double a32 = static_cast<double>(9)/static_cast<double>(32);
__device__ __constant__ double a41 = static_cast<double>(1932)/static_cast<double>(2197);
__device__ __constant__ double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
__device__ __constant__ double a43 = static_cast<double>(7296)/static_cast<double>(2197);
__device__ __constant__ double a51 = static_cast<double>(439)/static_cast<double>(216);
__device__ __constant__ double a52 = static_cast<double>(-8);
__device__ __constant__ double a53 = static_cast<double>(3680)/static_cast<double>(513);
__device__ __constant__ double a54 = static_cast<double>(-845)/static_cast<double>(4104);
__device__ __constant__ double a61 = static_cast<double>(-8)/static_cast<double>(27);
__device__ __constant__ double a62 = static_cast<double>(2);
__device__ __constant__ double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
__device__ __constant__ double a64 = static_cast<double>(1859)/static_cast<double>(4104);
__device__ __constant__ double a65 = static_cast<double>(-11)/static_cast<double>(40);

__device__ double temp1[MAXLENGTH];
__device__ double temp2[MAXLENGTH];
__device__ double temp3[MAXLENGTH];
__device__ double temp4[MAXLENGTH];
__device__ double tempsum[MAXLENGTH];
__device__ double k1s[MAXLENGTH];
__device__ double k2s[MAXLENGTH];
__device__ double k3s[MAXLENGTH];
__device__ double k4s[MAXLENGTH];
__device__ double k5s[MAXLENGTH];
__device__ double k6s[MAXLENGTH];

void printColumnText(string , double*, double [NUMPOINTS][MAXLENGTH]);
__global__ void rkf5(size_t, double*, double* , double*, double*);
__global__ void calcK(double*, double*, double*);
__device__ void calcKs(double*, double*);
__global__ void calcFlux(double*, double*, double*);
__device__ void calcMonomerFlux(double*, double*, double*);
__device__ void calcStepSize(double*, double*, double*, int*);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);
__device__ double flux(int, double*);
__device__ double knowles_flux(int, double*);
__device__ void zeroTemps();
__global__ void storeConcs(double*, size_t, double*, int);
__device__ void storeTime(double*, double, int);



#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    //Error checking
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main(int argc, char** argv)
{
    //Main program.
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    cudaSetDevice(0);
    std::cout << std::fixed;
    std::cout << std::setprecision(16);

    const int numpoints = NUMPOINTS;
    const int maxlength = MAXLENGTH;
    double mo = 5E-6;
    double to = 0;
    double tf = 7200;
    double dt = (tf-to)/static_cast<double>(numpoints);
    string filename = "ItWorks.dat";

    double concs[maxlength] = {};
    double ts[numpoints]= {};

    std::cout<<dt;
    std::cout<<"\n";
    concs[0]=mo;
    std::cout<<concs[0];
    std::cout<<" ";

    concs[0]=mo;
    std::cout<<"\n";

    double *d_concStorage;
    double *d_concs;
    double *d_dt;
    double *d_to;
    double *d_tf;
    double *d_ts;

    size_t size_concs = sizeof(concs);
    size_t size_dt = sizeof(dt);
    size_t size_to = sizeof(to);
    size_t size_tf = sizeof(tf);
    size_t size_ts = sizeof(ts);
    size_t h_pitch = maxlength*sizeof(double);
    size_t d_pitch;

    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints)); 

    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
    gpuErrchk(cudaMalloc((void**)&d_to, size_to));
    gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));
    gpuErrchk(cudaMalloc((void**)&d_ts, size_ts));

    gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ts, &ts, size_ts, cudaMemcpyHostToDevice));

    rkf5<<<1,1>>>(d_pitch, d_concStorage, d_concs, d_dt, d_ts);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    //Copy 2D array of concentrations vs. time from GPU to Host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyDeviceToHost) );   
    gpuErrchk( cudaMemcpy(&ts, d_ts, size_ts, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();

    printColumnText(filename, ts, concStorage);

    cudaDeviceReset();
    return 0;
}
void printColumnText(string filename, double ts[NUMPOINTS], double concStorage[NUMPOINTS][MAXLENGTH])
{
    ofstream myfile2;
    myfile2.open (filename);
    myfile2 << std::fixed << std::setprecision(16);
    for(int j=0; j < NUMPOINTS; j++)
    {
        for (int i=0; i < (MAXLENGTH+1); i++)
        {
            if (i == 0)
            {
                myfile2 << std::fixed << std::setprecision(16) << ts[j];
                //std::cout<<ts[j];
                myfile2 << "\t";
            }
            else
            {
                myfile2 << std::fixed << std::setprecision(16) << concStorage[j][i-1];
                //std::cout<<concStorage[j][i-1];
                myfile2 << "\t";
            }
        }
        myfile2 <<"\n";
    }
    myfile2.close();

}
__global__ void rkf5(size_t pitch, double* concStorage, double* concs, double* dt, double* d_ts)
{
    zeroTemps();
    double currentTime = 0;         //This can be generalized for a different start time.

    for(int k = 0; k < numpoints; k++)
    {
        double internalCounter = 0;
        double error = localError + 1;  //Ensure adaptive step size loop happens at least once per timestep.
        int errorIdx = -1;              //Used to do something.
        zeroTemps();

        storeConcs<<< numBlocks, numThreads >>>(concStorage, pitch, concs, k);  //Store this step's concentrations in 2D array
        cudaDeviceSynchronize();

        while (error > localError)
        {
            internalCounter++;
            calcKs(concs, dt);
            cudaDeviceSynchronize();

            calcStepSize(concs, dt, &error, &errorIdx); //temp1 = 4th Order guess, temp2 = 5th Order guess
            cudaDeviceSynchronize();

            if (error > localError)
            {
                //*dt = .5* (*dt);
                *dt = pow((localError/error),(.2))*(*dt);
            }
            else if (error < localError)
            {
                //if (error < .75 * localError)
                    *dt = pow((localError/error),(.2))*(*dt);
                    //*dt = 1.25*(*dt); 
            }
            //*/
        }
        currentTime += (*dt);
        storeTime(d_ts, currentTime, k);

        cudaDeviceSynchronize();
        arrCopy<<< numBlocks, numThreads >>>(concs, temp2);  //Probably not necessary if I find way to handle storing IC's better.
        cudaDeviceSynchronize();
    }
}

__device__ void calcStepSize(double* concs, double* dt, double* error, int* errorIdx)
{
    takeFourthOrderStep<<< numBlocks, numThreads >>>(temp1, concs, k1s, k2s, k3s, k4s, k5s);
    takeFifthOrderStep<<< numBlocks, numThreads >>>(temp2, concs, k1s, k2s, k3s, k4s, k5s, k6s);
    cudaDeviceSynchronize();
    arrSub<<< numBlocks, numThreads >>>(temp1, temp2, temp3);
    cudaDeviceSynchronize();
    arrAbs<<< numBlocks, numThreads >>>(temp3);
    cudaDeviceSynchronize();
    arrMax(temp3, error, errorIdx);
    cudaDeviceSynchronize();
}
__device__ void calcKs(double* concs, double *dt)
{
    zeroTemps();
    calcFlux<<< numBlocks, numThreads >>>(concs, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k1s, temp2, dt);
    cudaDeviceSynchronize();

    zeroTemps();                                                        //temp1 = temp2 = tempsum = 0
    arrMultAddStore(a21, temp1, tempsum, k1s, concs);                                                   //tempsum = a21*k1
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);                                               //tempsum = concs + a21*k1    
    cudaDeviceSynchronize();
    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);      //temp2 = fluxes
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k2s, temp2, dt);                                                     //k2s = fluxes*dt
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a31, temp1, tempsum, k1s, concs);
    arrMultAddStore(a32, temp1, tempsum, k2s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k3s, temp2, dt);
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a41, temp1, tempsum, k1s, concs);
    arrMultAddStore(a42, temp1, tempsum, k2s, concs);
    arrMultAddStore(a43, temp1, tempsum, k3s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k4s, temp2, dt);
    cudaDeviceSynchronize(); 

    zeroTemps();
    arrMultAddStore(a51, temp1, tempsum, k1s, concs);
    arrMultAddStore(a52, temp1, tempsum, k2s, concs);
    arrMultAddStore(a53, temp1, tempsum, k3s, concs);
    arrMultAddStore(a54, temp1, tempsum, k4s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k5s, temp2, dt);                                                         //k4s = fluxes*dt
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a61, temp1, tempsum, k1s, concs);
    arrMultAddStore(a62, temp1, tempsum, k2s, concs);
    arrMultAddStore(a63, temp1, tempsum, k3s, concs);
    arrMultAddStore(a64, temp1, tempsum, k4s, concs);
    arrMultAddStore(a65, temp1, tempsum, k5s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);          //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k6s, temp2, dt);                                                         //k4s = fluxes*dt
    cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished
    //At this point, temp1 and tempsum are maxlength dimension arrays that are able to be used for other things.
}
__global__ void takeFourthOrderStep(double* y4, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s)
{
    //takeFourthOrderStep is going to overwrite the old temp1 array with the new array of concentrations that result from a 4th order step.  This kernel is meant to be launched 
    // with as many threads as there are discrete concentrations to be tracked.
    double b41 = static_cast<double>(25)/static_cast<double>(216);
    double b42 = static_cast<double>(0);
    double b43 = static_cast<double>(1408)/static_cast<double>(2565);
    double b44 = static_cast<double>(2197)/static_cast<double>(4104);
    double b45 = static_cast<double>(-1)/static_cast<double>(5);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y4[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
__global__ void takeFifthOrderStep(double* y5, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s)
{
    //takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations.  As of now, this will be the 5th order step.  Another function can be d
    //defined that will take a fourth order step if that is interesting for any reason.  This kernel is meant to be launched with as many threads as there are discrete concentrations
    //to be tracked.
    //Store b values in register? Constants?
    double b51 = static_cast<double>(16)/static_cast<double>(135);
    double b52 = static_cast<double>(0);
    double b53 = static_cast<double>(6656)/static_cast<double>(12825);
    double b54 = static_cast<double>(28561)/static_cast<double>(56430);
    double b55 = static_cast<double>(-9)/static_cast<double>(50);
    double b56 = static_cast<double>(2)/static_cast<double>(55);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y5[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
__device__ void zeroTemps()
{
    //Initializes all the temporary storage arrays to 0.
    //Tested, works.
    arrInit<<< numBlocks, numThreads >>>(temp1, 0);
    arrInit<<< numBlocks, numThreads >>>(temp2, 0);
    arrInit<<< numBlocks, numThreads >>>(temp3, 0);
    arrInit<<< numBlocks, numThreads >>>(temp4, 0);
    arrInit<<< numBlocks, numThreads >>>(tempsum, 0);
    cudaDeviceSynchronize();
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
//cId = the row of cS I want to store concs in.
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
    //int bIdx = blockIdx.x;
    int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
    //cS is basically the memory address of the first element of the flattened (1D) 2D array.
    double* row = (double*)((char*)cS + cId * pitch);
    row[tIdx] = concs[tIdx];
}
__device__ void storeTime(double* timeArray, double timeValue, int k)
{
    timeArray[k] = timeValue;
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    fluxes[idx]=knowles_flux(idx, concs);
    //fluxes[idx]=flux(idx, concs);
}
__global__ void calcK(double* ks, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ks[idx]=(*dt)*fluxes[idx];
}
//This function calculates separatemely the flux of the monomer species.
//Tested, works.
__device__ void calcMonomerFlux(double* fluxes, double* lengths, double* dt)
{
    arrInitToLengths<<< numBlocks, numThreads >>>(lengths);         //lengths = 1,2,3,4,5...maxlength
    cudaDeviceSynchronize();
    arrMult<<< numBlocks, numThreads >>>(fluxes, lengths, lengths); //lengths = r * fluxes[r]
    cudaDeviceSynchronize();
    fluxes[0]=-static_cast<double>(1)*arrSum(lengths);              //fluxes[0] = -1*sum (r* fluxes[r])
}
//Placeholder function for the flux calculation.  It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs) 
{
    return -concs[r];
}
//I need to use constants and replace these for loops with dynamic reductions.
__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((maxlength)-1))
        {
        flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
        }
    else if (r > ((nc)-1))
        {
        for (int s = r+1; s < (maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
        }
    else if (r == ((nc)-1))
        {
        for (int s = r+1; s < (maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
        }
    else if (r < ((nc)-1))
        {
        flux = 0;
        }
    return flux;
}
//Adds two arrays (a + b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]+b[idx];
}
//Subtracts two arrays (a - b) element by element and stores the result in array c.
__global__ void arrSub(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]-b[idx];
}
//Multiplies two arrays (a * b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]*b[idx];
}
//Divides two arrays (a / b) element by element and stores the result in array c.
__global__ void arrDiv(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]/b[idx];
}
__global__ void arrAbs(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx] = abs(a[idx]);
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b;
}
//Initializes an array a to the values of counting numbers.  Tested, works.
__global__ void arrInitToLengths(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=idx+1;
}
//__global__ void arr2DInit(double* a, )
__global__ void arrReverseInitToLengths(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=1000-idx;
}
//Copies array b onto array a.
__global__ void arrCopy(double* a, double* b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b[idx];
}
//Sums all elements of array. Tested, works.
__device__ double arrSum(double* a)
{
    double sum = 0;
    for (int i = 0; i < maxlength; i++) 
    {
        sum += a[i];
    }
    return sum;
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum.  Used to
//add all the a*k terms. concs not necessary
//e.g. arrMultAddStore(a21, temp1, tempsum, k1s, concs, maxlength) => tempsum = a21 * k1
__device__ void arrMultAddStore(double tableauValue, double *temp1, double *tempsum, double *ks, double *concs) 
{
    //Sets tempsum to tabVal * k
    arrInit<<< numBlocks, numThreads >>>(temp1, tableauValue);      //Set [temp1] to tableau value, temp1 = a
    cudaDeviceSynchronize();
    arrMult<<< numBlocks, numThreads >>>(ks, temp1, temp1);         //Multiply tableau value by appropriate [k], temp1 = a*k
    cudaDeviceSynchronize();
    arrAdd<<< numBlocks, numThreads >>>(tempsum, temp1, tempsum);   //Move tabVal*k to [tempsum], tempsum = tempsum+temp1
    cudaDeviceSynchronize();
    //temp1 = tableauValue * kArray
    //tempsum = current sum (tableauValue * kArray)
}
__device__ int arrLength(double* arr)
{
    return sizeof(arr)/sizeof(arr[0]);
}
__device__ void arrMax(double* arr, double* maxVal, int* maxIdx )
{
    //int maxIdxID = 0;
    int maxThreads = 1024;
    int blocks = int(maxlength/maxThreads)+1;   //works

    double* kernelMaxes= new double[blocks];
    double* blockMaxes= new double[1];
    double* kernelIdxs= new double[blocks];
    double* blockIdxs= new double[1];
    double* temp= new double[blocks];

    arrInit<<< 1, blocks >>>(kernelMaxes, 0);   //works
    arrInit<<< 1, 1 >>>(blockMaxes, 0); //works
    arrInitToLengths<<< 1, blocks >>>(kernelIdxs);  //works
    arrInit<<< 1, 1 >>>(blockIdxs, 0);  //works
    arrInit<<< 1, blocks >>>(temp, 1);
    cudaDeviceSynchronize();
    arrSub<<< 1, blocks >>>(kernelIdxs, temp, kernelIdxs);  //kernel Idxs now initted to index
    cudaDeviceSynchronize();

    arrMaxKernel<<< blocks, maxThreads, maxThreads*sizeof(double) >>>(arr, kernelMaxes, kernelIdxs, maxlength);
    cudaDeviceSynchronize();
    arrMaxKernel<<< 1, blocks, blocks*sizeof(double) >>>(kernelMaxes, blockMaxes, blockIdxs, blocks);
    cudaDeviceSynchronize();
    *maxVal = blockMaxes[0];
    *maxIdx = blockIdxs[0];
}
__global__ void arrMaxKernel(double* arr, double* maxes, double* idxs, int length)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    //int maxIdx = 0;
    extern __shared__ double blockMemory[];

    if (idx < length)
    {
        blockMemory[threadIdx.x] = arr[idx];
        //blockMemory2[threadIdx.x] = idxs[idx];
    }
    else
    {
        blockMemory[threadIdx.x] = 0;
        //blockMemory2[threadIdx.x] = -1;
    }

    __syncthreads();

    int stage = 0;
    int maxStage = static_cast<int>(logf(blockDim.x)/logf(2));  //logf needed for CUDA

    while (stage <= maxStage)
    {
        int left = threadIdx.x;         
        int right = (threadIdx.x) + powf(2, (stage));

        if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
        {
            if ( (blockMemory[right] > blockMemory[left]) )
            {
                blockMemory[left] = blockMemory[right];
                //blockMemory2[left] = blockMemory2[right];
            }
        }
        stage++;
        __syncthreads();
    }

    maxes[blockIdx.x] = blockMemory[0];
    //idxs[blockIdx.x] = blockMemory2[0];
}

修改

所以这就是13小时运行的CUDA memcheck告诉我的。所以现在我可以看到我的哪些功能是非法写的...但这几乎使我的问题更加神秘。我正在更改的参数仅修改2D全局数组的大小以及我的积分器迭代的次数。我现在想知道的是，如果我可以运行NSIGHT CUDA调试器，直到它找到错误然后我可以查看导致它的值？我启用了memcheck，所以我认为如果不再使用它将需要13小时，哈哈。

cuda-memcheck output

Answer 1

如果有人遇到同样的问题，罗伯特克罗维拉确实是对的。你只需要更长时间地运行cuda-memcheck。它花了我13个小时，它确实帮助我找出了错误。

cuda-memcheck似乎没有终止

1 个答案: