当我在我的ODE积分器上运行cuda-memcheck某个参数(时间步数,“NUMPOINTS”)小于某个值(10240)时,它运行正常。当我增加该值时,它会以“GPUassert:未指定的启动失败”终止。此外,当我在其上运行cuda-memcheck时,它似乎根本没有终止。我正在运行MSVS2012,Windows 7,GTX Titan,332.21驱动程序。
搜索周围,我发现http://www.linkedin.com/groups/Unspecified-launch-failure-How-can-1618517.S.69436913,这表明它是由以下两者引起的:
1。)Segfault
2。)内核“太长”
3.)尝试使用过多的共享内存
我不应该接近我的泰坦的全球内存上限,对吧?存储在GPU上的2D数组,我在其中存储我在积分器中计算的所有数据,大小为11264 x 8960,其中每个元素都是double。这将是100,925,440双打,总计807,403,520字节,或788MBytes,Titan有6GB。
如果我在无头GPU上进行集成并且已经运行了长度为〜1000s的内核,那么我的内核长度是否重要?
此参数与我对共享内存的使用无关。
编辑
这不是最小的,但在这里。
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
using namespace std;
#define NUMBLOCKSPERGRID 35
#define NUMTHREADSPERBLOCK 256
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID
#define NUMPOINTS 1024*11 //TROUBLEMAKER
double concStorage[NUMPOINTS][MAXLENGTH] = {};
__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK;
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID;
__device__ __constant__ int numpoints = NUMPOINTS;
__device__ __constant__ int maxlength = MAXLENGTH;
__device__ __constant__ double localError = 1E-12;
__device__ __constant__ int nc = 2;
__device__ __constant__ int n2 = 0;
__device__ __constant__ double ka = 5E4;
__device__ __constant__ double kb = 0;
__device__ __constant__ double kp = 0;
__device__ __constant__ double km = 2E-8;
__device__ __constant__ double kn = 2E-5;
__device__ __constant__ double kn2 = 0;
__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrDiv(double*, double*, double*);
__global__ void arrAbs(double*);
__global__ void arrInit(double*, double);
__global__ void arrInitToLengths(double*);
__global__ void arrCopy(double*, double*);
__global__ void arrMaxKernel(double*, double*, double*, int);
__device__ double arrSum(double*);
__device__ void arrMultAddStore(double , double*, double*, double*, double*);
__device__ int arrLength(double*);
__device__ void arrMax(double*, double*, int*);
__device__ __constant__ double a21 = static_cast<double>(.25);
__device__ __constant__ double a31 = static_cast<double>(3)/static_cast<double>(32);
__device__ __constant__ double a32 = static_cast<double>(9)/static_cast<double>(32);
__device__ __constant__ double a41 = static_cast<double>(1932)/static_cast<double>(2197);
__device__ __constant__ double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
__device__ __constant__ double a43 = static_cast<double>(7296)/static_cast<double>(2197);
__device__ __constant__ double a51 = static_cast<double>(439)/static_cast<double>(216);
__device__ __constant__ double a52 = static_cast<double>(-8);
__device__ __constant__ double a53 = static_cast<double>(3680)/static_cast<double>(513);
__device__ __constant__ double a54 = static_cast<double>(-845)/static_cast<double>(4104);
__device__ __constant__ double a61 = static_cast<double>(-8)/static_cast<double>(27);
__device__ __constant__ double a62 = static_cast<double>(2);
__device__ __constant__ double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
__device__ __constant__ double a64 = static_cast<double>(1859)/static_cast<double>(4104);
__device__ __constant__ double a65 = static_cast<double>(-11)/static_cast<double>(40);
__device__ double temp1[MAXLENGTH];
__device__ double temp2[MAXLENGTH];
__device__ double temp3[MAXLENGTH];
__device__ double temp4[MAXLENGTH];
__device__ double tempsum[MAXLENGTH];
__device__ double k1s[MAXLENGTH];
__device__ double k2s[MAXLENGTH];
__device__ double k3s[MAXLENGTH];
__device__ double k4s[MAXLENGTH];
__device__ double k5s[MAXLENGTH];
__device__ double k6s[MAXLENGTH];
void printColumnText(string , double*, double [NUMPOINTS][MAXLENGTH]);
__global__ void rkf5(size_t, double*, double* , double*, double*);
__global__ void calcK(double*, double*, double*);
__device__ void calcKs(double*, double*);
__global__ void calcFlux(double*, double*, double*);
__device__ void calcMonomerFlux(double*, double*, double*);
__device__ void calcStepSize(double*, double*, double*, int*);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);
__device__ double flux(int, double*);
__device__ double knowles_flux(int, double*);
__device__ void zeroTemps();
__global__ void storeConcs(double*, size_t, double*, int);
__device__ void storeTime(double*, double, int);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
//Error checking
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(int argc, char** argv)
{
//Main program.
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
cudaSetDevice(0);
std::cout << std::fixed;
std::cout << std::setprecision(16);
const int numpoints = NUMPOINTS;
const int maxlength = MAXLENGTH;
double mo = 5E-6;
double to = 0;
double tf = 7200;
double dt = (tf-to)/static_cast<double>(numpoints);
string filename = "ItWorks.dat";
double concs[maxlength] = {};
double ts[numpoints]= {};
std::cout<<dt;
std::cout<<"\n";
concs[0]=mo;
std::cout<<concs[0];
std::cout<<" ";
concs[0]=mo;
std::cout<<"\n";
double *d_concStorage;
double *d_concs;
double *d_dt;
double *d_to;
double *d_tf;
double *d_ts;
size_t size_concs = sizeof(concs);
size_t size_dt = sizeof(dt);
size_t size_to = sizeof(to);
size_t size_tf = sizeof(tf);
size_t size_ts = sizeof(ts);
size_t h_pitch = maxlength*sizeof(double);
size_t d_pitch;
gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints));
gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
gpuErrchk(cudaMalloc((void**)&d_to, size_to));
gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));
gpuErrchk(cudaMalloc((void**)&d_ts, size_ts));
gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ts, &ts, size_ts, cudaMemcpyHostToDevice));
rkf5<<<1,1>>>(d_pitch, d_concStorage, d_concs, d_dt, d_ts);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
//Copy 2D array of concentrations vs. time from GPU to Host.
gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(&ts, d_ts, size_ts, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
printColumnText(filename, ts, concStorage);
cudaDeviceReset();
return 0;
}
void printColumnText(string filename, double ts[NUMPOINTS], double concStorage[NUMPOINTS][MAXLENGTH])
{
ofstream myfile2;
myfile2.open (filename);
myfile2 << std::fixed << std::setprecision(16);
for(int j=0; j < NUMPOINTS; j++)
{
for (int i=0; i < (MAXLENGTH+1); i++)
{
if (i == 0)
{
myfile2 << std::fixed << std::setprecision(16) << ts[j];
//std::cout<<ts[j];
myfile2 << "\t";
}
else
{
myfile2 << std::fixed << std::setprecision(16) << concStorage[j][i-1];
//std::cout<<concStorage[j][i-1];
myfile2 << "\t";
}
}
myfile2 <<"\n";
}
myfile2.close();
}
__global__ void rkf5(size_t pitch, double* concStorage, double* concs, double* dt, double* d_ts)
{
zeroTemps();
double currentTime = 0; //This can be generalized for a different start time.
for(int k = 0; k < numpoints; k++)
{
double internalCounter = 0;
double error = localError + 1; //Ensure adaptive step size loop happens at least once per timestep.
int errorIdx = -1; //Used to do something.
zeroTemps();
storeConcs<<< numBlocks, numThreads >>>(concStorage, pitch, concs, k); //Store this step's concentrations in 2D array
cudaDeviceSynchronize();
while (error > localError)
{
internalCounter++;
calcKs(concs, dt);
cudaDeviceSynchronize();
calcStepSize(concs, dt, &error, &errorIdx); //temp1 = 4th Order guess, temp2 = 5th Order guess
cudaDeviceSynchronize();
if (error > localError)
{
//*dt = .5* (*dt);
*dt = pow((localError/error),(.2))*(*dt);
}
else if (error < localError)
{
//if (error < .75 * localError)
*dt = pow((localError/error),(.2))*(*dt);
//*dt = 1.25*(*dt);
}
//*/
}
currentTime += (*dt);
storeTime(d_ts, currentTime, k);
cudaDeviceSynchronize();
arrCopy<<< numBlocks, numThreads >>>(concs, temp2); //Probably not necessary if I find way to handle storing IC's better.
cudaDeviceSynchronize();
}
}
__device__ void calcStepSize(double* concs, double* dt, double* error, int* errorIdx)
{
takeFourthOrderStep<<< numBlocks, numThreads >>>(temp1, concs, k1s, k2s, k3s, k4s, k5s);
takeFifthOrderStep<<< numBlocks, numThreads >>>(temp2, concs, k1s, k2s, k3s, k4s, k5s, k6s);
cudaDeviceSynchronize();
arrSub<<< numBlocks, numThreads >>>(temp1, temp2, temp3);
cudaDeviceSynchronize();
arrAbs<<< numBlocks, numThreads >>>(temp3);
cudaDeviceSynchronize();
arrMax(temp3, error, errorIdx);
cudaDeviceSynchronize();
}
__device__ void calcKs(double* concs, double *dt)
{
zeroTemps();
calcFlux<<< numBlocks, numThreads >>>(concs, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k1s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps(); //temp1 = temp2 = tempsum = 0
arrMultAddStore(a21, temp1, tempsum, k1s, concs); //tempsum = a21*k1
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum); //tempsum = concs + a21*k1
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt); //temp2 = fluxes
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k2s, temp2, dt); //k2s = fluxes*dt
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a31, temp1, tempsum, k1s, concs);
arrMultAddStore(a32, temp1, tempsum, k2s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k3s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a41, temp1, tempsum, k1s, concs);
arrMultAddStore(a42, temp1, tempsum, k2s, concs);
arrMultAddStore(a43, temp1, tempsum, k3s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k4s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a51, temp1, tempsum, k1s, concs);
arrMultAddStore(a52, temp1, tempsum, k2s, concs);
arrMultAddStore(a53, temp1, tempsum, k3s, concs);
arrMultAddStore(a54, temp1, tempsum, k4s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k5s, temp2, dt); //k4s = fluxes*dt
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a61, temp1, tempsum, k1s, concs);
arrMultAddStore(a62, temp1, tempsum, k2s, concs);
arrMultAddStore(a63, temp1, tempsum, k3s, concs);
arrMultAddStore(a64, temp1, tempsum, k4s, concs);
arrMultAddStore(a65, temp1, tempsum, k5s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt); //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k6s, temp2, dt); //k4s = fluxes*dt
cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished
//At this point, temp1 and tempsum are maxlength dimension arrays that are able to be used for other things.
}
__global__ void takeFourthOrderStep(double* y4, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s)
{
//takeFourthOrderStep is going to overwrite the old temp1 array with the new array of concentrations that result from a 4th order step. This kernel is meant to be launched
// with as many threads as there are discrete concentrations to be tracked.
double b41 = static_cast<double>(25)/static_cast<double>(216);
double b42 = static_cast<double>(0);
double b43 = static_cast<double>(1408)/static_cast<double>(2565);
double b44 = static_cast<double>(2197)/static_cast<double>(4104);
double b45 = static_cast<double>(-1)/static_cast<double>(5);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
y4[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
__global__ void takeFifthOrderStep(double* y5, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s)
{
//takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations. As of now, this will be the 5th order step. Another function can be d
//defined that will take a fourth order step if that is interesting for any reason. This kernel is meant to be launched with as many threads as there are discrete concentrations
//to be tracked.
//Store b values in register? Constants?
double b51 = static_cast<double>(16)/static_cast<double>(135);
double b52 = static_cast<double>(0);
double b53 = static_cast<double>(6656)/static_cast<double>(12825);
double b54 = static_cast<double>(28561)/static_cast<double>(56430);
double b55 = static_cast<double>(-9)/static_cast<double>(50);
double b56 = static_cast<double>(2)/static_cast<double>(55);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
y5[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
__device__ void zeroTemps()
{
//Initializes all the temporary storage arrays to 0.
//Tested, works.
arrInit<<< numBlocks, numThreads >>>(temp1, 0);
arrInit<<< numBlocks, numThreads >>>(temp2, 0);
arrInit<<< numBlocks, numThreads >>>(temp3, 0);
arrInit<<< numBlocks, numThreads >>>(temp4, 0);
arrInit<<< numBlocks, numThreads >>>(tempsum, 0);
cudaDeviceSynchronize();
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
//cId = the row of cS I want to store concs in.
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
//int bIdx = blockIdx.x;
int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
//cS is basically the memory address of the first element of the flattened (1D) 2D array.
double* row = (double*)((char*)cS + cId * pitch);
row[tIdx] = concs[tIdx];
}
__device__ void storeTime(double* timeArray, double timeValue, int k)
{
timeArray[k] = timeValue;
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
fluxes[idx]=knowles_flux(idx, concs);
//fluxes[idx]=flux(idx, concs);
}
__global__ void calcK(double* ks, double* fluxes, double* dt)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
ks[idx]=(*dt)*fluxes[idx];
}
//This function calculates separatemely the flux of the monomer species.
//Tested, works.
__device__ void calcMonomerFlux(double* fluxes, double* lengths, double* dt)
{
arrInitToLengths<<< numBlocks, numThreads >>>(lengths); //lengths = 1,2,3,4,5...maxlength
cudaDeviceSynchronize();
arrMult<<< numBlocks, numThreads >>>(fluxes, lengths, lengths); //lengths = r * fluxes[r]
cudaDeviceSynchronize();
fluxes[0]=-static_cast<double>(1)*arrSum(lengths); //fluxes[0] = -1*sum (r* fluxes[r])
}
//Placeholder function for the flux calculation. It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs)
{
return -concs[r];
}
//I need to use constants and replace these for loops with dynamic reductions.
__device__ double knowles_flux(int r, double *conc)
{
double frag_term = 0;
double flux = 0;
if (r == ((maxlength)-1))
{
flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
}
else if (r > ((nc)-1))
{
for (int s = r+1; s < (maxlength); s++)
{
frag_term += conc[s];
}
flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
}
else if (r == ((nc)-1))
{
for (int s = r+1; s < (maxlength); s++)
{
frag_term += conc[s];
}
flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
}
else if (r < ((nc)-1))
{
flux = 0;
}
return flux;
}
//Adds two arrays (a + b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]+b[idx];
}
//Subtracts two arrays (a - b) element by element and stores the result in array c.
__global__ void arrSub(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]-b[idx];
}
//Multiplies two arrays (a * b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]*b[idx];
}
//Divides two arrays (a / b) element by element and stores the result in array c.
__global__ void arrDiv(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]/b[idx];
}
__global__ void arrAbs(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx] = abs(a[idx]);
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=b;
}
//Initializes an array a to the values of counting numbers. Tested, works.
__global__ void arrInitToLengths(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=idx+1;
}
//__global__ void arr2DInit(double* a, )
__global__ void arrReverseInitToLengths(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=1000-idx;
}
//Copies array b onto array a.
__global__ void arrCopy(double* a, double* b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=b[idx];
}
//Sums all elements of array. Tested, works.
__device__ double arrSum(double* a)
{
double sum = 0;
for (int i = 0; i < maxlength; i++)
{
sum += a[i];
}
return sum;
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum. Used to
//add all the a*k terms. concs not necessary
//e.g. arrMultAddStore(a21, temp1, tempsum, k1s, concs, maxlength) => tempsum = a21 * k1
__device__ void arrMultAddStore(double tableauValue, double *temp1, double *tempsum, double *ks, double *concs)
{
//Sets tempsum to tabVal * k
arrInit<<< numBlocks, numThreads >>>(temp1, tableauValue); //Set [temp1] to tableau value, temp1 = a
cudaDeviceSynchronize();
arrMult<<< numBlocks, numThreads >>>(ks, temp1, temp1); //Multiply tableau value by appropriate [k], temp1 = a*k
cudaDeviceSynchronize();
arrAdd<<< numBlocks, numThreads >>>(tempsum, temp1, tempsum); //Move tabVal*k to [tempsum], tempsum = tempsum+temp1
cudaDeviceSynchronize();
//temp1 = tableauValue * kArray
//tempsum = current sum (tableauValue * kArray)
}
__device__ int arrLength(double* arr)
{
return sizeof(arr)/sizeof(arr[0]);
}
__device__ void arrMax(double* arr, double* maxVal, int* maxIdx )
{
//int maxIdxID = 0;
int maxThreads = 1024;
int blocks = int(maxlength/maxThreads)+1; //works
double* kernelMaxes= new double[blocks];
double* blockMaxes= new double[1];
double* kernelIdxs= new double[blocks];
double* blockIdxs= new double[1];
double* temp= new double[blocks];
arrInit<<< 1, blocks >>>(kernelMaxes, 0); //works
arrInit<<< 1, 1 >>>(blockMaxes, 0); //works
arrInitToLengths<<< 1, blocks >>>(kernelIdxs); //works
arrInit<<< 1, 1 >>>(blockIdxs, 0); //works
arrInit<<< 1, blocks >>>(temp, 1);
cudaDeviceSynchronize();
arrSub<<< 1, blocks >>>(kernelIdxs, temp, kernelIdxs); //kernel Idxs now initted to index
cudaDeviceSynchronize();
arrMaxKernel<<< blocks, maxThreads, maxThreads*sizeof(double) >>>(arr, kernelMaxes, kernelIdxs, maxlength);
cudaDeviceSynchronize();
arrMaxKernel<<< 1, blocks, blocks*sizeof(double) >>>(kernelMaxes, blockMaxes, blockIdxs, blocks);
cudaDeviceSynchronize();
*maxVal = blockMaxes[0];
*maxIdx = blockIdxs[0];
}
__global__ void arrMaxKernel(double* arr, double* maxes, double* idxs, int length)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//int maxIdx = 0;
extern __shared__ double blockMemory[];
if (idx < length)
{
blockMemory[threadIdx.x] = arr[idx];
//blockMemory2[threadIdx.x] = idxs[idx];
}
else
{
blockMemory[threadIdx.x] = 0;
//blockMemory2[threadIdx.x] = -1;
}
__syncthreads();
int stage = 0;
int maxStage = static_cast<int>(logf(blockDim.x)/logf(2)); //logf needed for CUDA
while (stage <= maxStage)
{
int left = threadIdx.x;
int right = (threadIdx.x) + powf(2, (stage));
if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
{
if ( (blockMemory[right] > blockMemory[left]) )
{
blockMemory[left] = blockMemory[right];
//blockMemory2[left] = blockMemory2[right];
}
}
stage++;
__syncthreads();
}
maxes[blockIdx.x] = blockMemory[0];
//idxs[blockIdx.x] = blockMemory2[0];
}
修改
所以这就是13小时运行的CUDA memcheck告诉我的。所以现在我可以看到我的哪些功能是非法写的...但这几乎使我的问题更加神秘。我正在更改的参数仅修改2D全局数组的大小以及我的积分器迭代的次数。我现在想知道的是,如果我可以运行NSIGHT CUDA调试器,直到它找到错误然后我可以查看导致它的值?我启用了memcheck,所以我认为如果不再使用它将需要13小时,哈哈。
答案 0 :(得分:0)
如果有人遇到同样的问题,罗伯特克罗维拉确实是对的。你只需要更长时间地运行cuda-memcheck。它花了我13个小时,它确实帮助我找出了错误。