GPU似乎随机返回NaN

时间:2014-01-03 19:30:01

标签: cuda gpu differential-equations

我被这个问题困扰了一段时间,希望有人能帮忙解答。

在GTX690上运行内核时,偶尔会将NaN作为其中一个值返回并破坏模拟的其余部分。此外,每次运行模拟时,NaN值在不同位置和不同时间看似随机出现。在Geforce 630M芯片上也会出现此问题,因此我认为卡不是问题所在。

当我将内核改写为使用双精度计算后,问题出现的频率似乎降低了,但并没有完全解决。我猜测是内核执行过程中发生了某些我不了解的情况,从而导致了这个问题。我的代码如下。如能提供任何帮助,我将不胜感激。

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cuda.h>
#include <math.h>
#include <time.h>


// Copies component 0 of every 13-float cell record in States into the dense
// array Vstate.
//
// Launch layout: 1D grid of 1D blocks; blockIdx.x selects the row and
// threadIdx.x the column of a Simsize x Simsize grid (Simsize = mySimsize[0],
// read from device memory). Extra blocks/threads beyond Simsize do nothing.
//
//   States    - device array, 13 floats per cell, cell-major (cell i starts at 13*i)
//   Vstate    - device array, one float per cell (output)
//   mySimsize - device pointer to a single int holding the grid edge length
__global__ void getvstate (float *States,  float *Vstate, int *mySimsize) {
    const int Simsize = mySimsize[0];
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Guard against launches larger than the grid: without it a surplus thread
    // would alias into the next row and the last block would write out of bounds.
    if (row >= Simsize || col >= Simsize) {
        return;
    }

    const int idx = Simsize*row + col;
    Vstate[idx] = States[13*idx];
}


// Returns x / (exp(x) - 1), with the analytic limit 1 at x == 0.
// Several rate expressions below have this form; evaluated literally they are
// 0/0 (NaN) whenever V lands exactly on the singular voltage.
__device__ static double xOverExpm1(double x) {
    return (x == 0.0) ? 1.0 : x / expm1(x);
}

// Advances states 1..12 of cell `idx` in place by half a time step and returns
// the new value for state 0 (V) WITHOUT storing it, so the caller can defer the
// store until every thread of the block has read its neighbours' old V.
__device__ static float stepCell(float *States, int Simsize, int idx) {
    const double conn = 2;
    const double dx = 1.0;
    const double dt = .1;

    // Current state of this cell (13 consecutive floats), promoted to double.
    const double V = (double)States[idx*13 + 0];
    const double Cai = (double)States[idx*13 + 1];
    const double Casr = (double)States[idx*13 + 2];
    const double f = (double)States[idx*13 + 3];
    const double d = (double)States[idx*13 + 4];
    const double m = (double)States[idx*13 + 5];
    const double h = (double)States[idx*13 + 6];
    const double j = (double)States[idx*13 + 7];
    const double fca = (double)States[idx*13 + 8];
    const double Xkr = (double)States[idx*13 + 9];
    const double Xks = (double)States[idx*13 + 10];
    const double Xto = (double)States[idx*13 + 11];
    const double Yto = (double)States[idx*13 + 12];

    ////Constants///////////
    const double Gna = 12.8;
    const double Gk1 = 2.8;
    const double Gkr = 0.0136;
    const double Gks = 0.0245;
    const double Gkp = 0.002216;
    const double Gto = 0.23815;
    const double Gnab = 0.0031;
    const double Gcab = 0.0003842;
    const double Pca = 0.0000226;
    const double Pcak = 5.97e-7;
    const double Prel = 6;
    const double Pleak = 0.000001;
    const double Ibarnak = 0.693;
    const double Icahalf = -0.265;
    const double Ibarpca = 0.05;
    const double R = 8.314;
    const double T = 310;
    const double F = 96.5;
    const double Acap = 1.534e-4;
    const double Csc = 1;
    const double nu = .35;
    const double ksat = 0.2;
    const double knaca = 1500;
    const double Kmfca = 0.18;
    const double Kmk1 = 13;
    const double Kmna = 87.5;
    const double Kmca = 1380;
    const double Kmnai= 10;
    const double Kmko = 1.5;
    const double Kmpca = 0.05;
    const double Kmup = 0.32;
    const double CMDN = 10;
    const double CSQN = 10000;
    const double Kcmdn = 2;
    const double Kcsqn = 600;
    const double Vup = 0.1;
    const double Vmyo = 2.584e-5;
    const double Vsr = 2e-6;
    const double Nai = 10;
    const double Ki = 149.4;
    const double Nao = 138;
    const double Ko = 4;
    const double Cao = 2000;

    //sigmoids
    // .32*u/(1-exp(-.1*u)) with u = V+47.13, rewritten as 3.2 * x/(exp(x)-1),
    // x = -.1*u, so that u == 0 yields the limit 3.2 instead of 0/0.
    const double alpha_m = 3.2*xOverExpm1(-.1*(V+47.13));
    const double beta_m = 0.08*exp(-V/11);
    const double alpha_h = 0.135*exp((V+80)/(-6.8));
    const double beta_h = 7.5/(1+exp(-.1*(V+11)));
    const double alpha_j = (0.175*exp((V+100)/(-23)))/(1+exp(.15*(V+79)));
    const double beta_j = 0.3/(1+exp(-.1*(V+32)));

    const double Ena = (R*T/F)*log(Nao/Nai);
    const double Ina = Gna*m*m*m*h*j*(V-Ena);

    const double Ek = (R*T/F)*log(Ko/Ki);
    const double Kinf = 1/(2+exp(1.62*(F/(R*T))*(V-Ek)));
    const double Ikl = Gk1*Kinf*(Ko/(Ko+Kmk1))*(V-Ek);

    const double Rv = 1/(1+2.5*exp(.1*(V+28)));
    const double taukr = 43 + 1/(exp(-5.495 + .1691*V) + exp(-7.677-0.0128*V));
    const double Xkrinf = 1/(1+exp(-2.182-0.1819*V));
    const double Ikr = Gkr*Rv*Xkr*sqrt(Ko/4)*(V-Ek);

    // Both terms are of the form c*w/(exp(k*w)-1) with w = V-10 and were 0/0
    // at V == 10; expressed through xOverExpm1 they take their finite limits.
    const double tauks = 1/( (.0000719/.148)*xOverExpm1(-.148*(V-10))
                           + (.000131/.0687)*xOverExpm1(.0687*(V-10)) );
    const double Xksinf = 1/(1+exp((V-16)/(-13.6)));
    const double Eks = (R*T/F)*log((Ko + 0.01833*Nao)/(Ki + 0.01833*Nai));
    const double Iks = Gks*Xks*Xks*(V-Eks);

    const double alpha_xto = 0.04516*exp(0.03577*V);
    const double beta_xto = 0.0989*exp(-0.06237*V);
    const double alpha_yto = (0.005415*exp((V+33.5)/(-5)))/(1+0.051335*exp((V+33.5)/(-5)));
    const double beta_yto =  (0.005415*exp((V+33.5)/(5)))/(1+0.051335*exp((V+33.5)/(5)));
    const double Ito = Gto*Xto*Yto*(V-Ek);

    const double Kkp = 1/(1+exp((7.488-V)/(5.98)));
    const double Ikp = Gkp*Kkp*(V-Ek);

    // Was (1/7)*...: integer division made the factor 0 and sigma always 0.
    const double sigma = (1.0/7.0)*(exp(Nao/67.3)-1);
    const double fnak = 1/(1+.1245*exp(-.1*(V*F)/(R*T)) + 0.0365*sigma*exp(-V*F/(R*T)));
    const double Inak = Ibarnak*fnak*(1/(1+   sqrt((Kmnai/Nai)*(Kmnai/Nai)*(Kmnai/Nai))   ))*(Ko/(Ko+Kmko));

    const double Eca = ((R*T)/(2*F))*log(Cao/Cai);
    const double Icab = Gcab*(V-Eca);
    const double Ipca = (Ibarpca*Cai)/(Kmpca+Cai);
    const double Inaca = (knaca/(   Kmna*Kmna*Kmna    +   Nao*Nao*Nao   )) * (1/(Kmca + Cao)) * (1/(1+ksat*exp(V*F*(nu-1)/(R*T)))) * (exp(V*F*nu/(R*T))*Nai*Nai*Nai*Cao-exp(V*F*(nu-1)/(R*T))*Nao*Nao*Nao*Cai);
    const double Inab = Gnab*(V-Ena);

    const double finf = 1/(1+exp((V+12.5)/5));
    const double tauf = 30 + 200/(1 + exp((V+20)/9.5));
    const double dinf = 1/(1+exp((V+10)/-6.24));
    const double taud = 1 / (   (.25*exp(-.01*V))/(1 + exp(-.07*V)) + (0.07*exp(-0.05*(V+40)))/(1+exp(.05*(V+40))));
    const double fcainf = 1/(1 + (Cai/Kmfca)*(Cai/Kmfca)*(Cai/Kmfca)   );
    const double taufca = 30;

    // With z = 2*V*F/(R*T): 4*V*F*F/(R*T) == 2*F*z, so the original
    // (4VF^2/RT)*(...)/(exp(z)-1) equals 2*F*(...)*z/(exp(z)-1); finite at V == 0.
    const double zCa = 2*V*F/(R*T);
    const double Ibarca = (Pca/Csc)*(2*F)*(-.341*Cao+Cai*exp(zCa))*xOverExpm1(zCa);
    const double Ica = Ibarca*f*d*fca;

    // Same treatment with y = V*F/(R*T): 1000*V*F*F/(R*T) == 1000*F*y.
    const double yK = V*F/(R*T);
    const double Icak = (Pcak/Csc)*(f*d*fca/(1+(Ibarca/Icahalf)))*(1000*F)*(Ki*exp(yK)-Ko)*xOverExpm1(yK);

    const double Betasr = 1/(1 + ((CSQN*Kcsqn)/(   (Kcsqn+Casr)* (Kcsqn+Casr)   )));
    const double Jleak = Pleak*(Casr - Cai);
    const double Jup = Vup/(1 +   (Kmup/Cai)*(Kmup/Cai)    );
    const double gamma = 1/(1 + (2000/Casr)* (2000/Casr)*(2000/Casr)    );
    const double Jrel = Prel*f*d*fca*(gamma*Casr-Cai)/(1 + 1.65*exp(V/20));
    const double Betai = 1/(1 + (CMDN*Kcmdn)/(    (Kcmdn+Cai)*(Kcmdn+Cai)    ));

    const double currents = -(Ina + Ikl + Ikr + Iks + Ito + Ikp + Inak + Inaca + Inab + Icab + Ipca + Ica + Icak);

    // Offsets of state 0 (V) of the four neighbours; indices are clamped at
    // the grid edge, so an edge cell uses itself as the missing neighbour.
    const int yy = blockIdx.x;
    const int xx = threadIdx.x;
    const int Vplusidx  = 13*(Simsize*min(yy+1,Simsize-1) + xx);
    const int Vminusidx = 13*(Simsize*max(0,yy-1) + xx);
    const int Vrightidx = 13*(Simsize*yy + min(Simsize-1,xx + 1));
    const int Vleftidx  = 13*(Simsize*yy + max(0,xx-1));

    ////////////////////RHS Function Evaluation
    const float Vbuf = (float)((1-(4*dt*conn)/(2*dx*dx))*V + (dt*conn)/(2*dx*dx)*(States[Vplusidx]+States[Vminusidx]+States[Vleftidx] + States[Vrightidx]) + dt*currents/2);
    //////////////////////

    //Update states 1..12 (only ever read by this same thread)
    States[13*idx + 1] = (float)(.5*dt*Betai*(Jrel + Jleak - Jup - ((Acap*Csc)/(2*F*Vmyo))*(Ica+Icab+Ipca-2*Inaca)) + Cai);
    States[13*idx + 2] = (float)(.5*dt*Betasr*(Jup-Jleak-Jrel)*(Vmyo/Vsr) + Casr);
    States[13*idx + 3] =  (float)((f-finf)*exp(-(.5*dt)/tauf) + finf);
    States[13*idx + 4] =  (float)((d-dinf)*exp(-(.5*dt)/taud) + dinf);

    const double y6inf = alpha_m/(alpha_m + beta_m);
    States[13*idx + 5] = (float)(y6inf-(y6inf-m)*exp(-(.5*dt)*(alpha_m+beta_m)));

    const double y7inf = alpha_h/(alpha_h + beta_h);
    States[13*idx + 6] = (float)(y7inf-(y7inf-h)*exp(-(.5*dt)*(alpha_h+beta_h)));

    const double y8inf = alpha_j/(alpha_j + beta_j);
    States[13*idx + 7] = (float)(y8inf-(y8inf-j)*exp(-(.5*dt)*(alpha_j+beta_j)));

    States[13*idx + 8] = (float)((fca-fcainf)*exp(-(.5*dt)/taufca) + fcainf);
    States[13*idx + 9] = (float)((Xkr-Xkrinf)*exp(-(.5*dt)/taukr) + Xkrinf);
    States[13*idx + 10] = (float)((Xks-Xksinf)*exp(-(.5*dt)/tauks) + Xksinf);

    const double y12inf = alpha_xto/(alpha_xto + beta_xto);
    States[13*idx + 11] = (float)(y12inf-(y12inf-Xto)*exp(-(.5*dt)*(alpha_xto+beta_xto)));

    const double y13inf = alpha_yto/(alpha_yto + beta_yto);
    States[13*idx + 12] = (float)(y13inf-(y13inf-Yto)*exp(-(.5*dt)*(alpha_yto+beta_yto)));

    return Vbuf;
}

// Advances every cell of a Simsize x Simsize grid by one update, in place.
//
// Launch layout: 1D grid of 1D blocks; blockIdx.x is the row and threadIdx.x
// the column (Simsize = mySimsize[0], read from device memory). Blocks/threads
// outside the grid perform no memory access but still reach the barrier.
// No shared memory is used. The math is done in double precision, so build for
// a target architecture that supports it.
//
//   States    - device array, 13 floats per cell, cell-major; updated in place
//   mySimsize - device pointer to a single int holding the grid edge length
//
// NOTE(review): state 0 (V) of the rows above and below is read from cells
// owned by other blocks. __syncthreads() only orders threads of the same block,
// so such a neighbour may already hold its *new* V when it is read here, which
// makes results depend on block scheduling. Removing that race needs a separate
// read-only copy of V (i.e. a change to this kernel's parameters), so it is
// documented here rather than changed.
__global__ void Evaluate_Functions (float *States,  int *mySimsize) {
    const int Simsize = mySimsize[0];
    const int idx = Simsize*blockIdx.x + threadIdx.x;
    const bool inGrid = (int)blockIdx.x < Simsize && (int)threadIdx.x < Simsize;

    float Vbuf = 0.0f;
    if (inGrid) {
        Vbuf = stepCell(States, Simsize, idx);
    }

    // Every thread of the block reaches this barrier (it is outside the
    // divergent branch); it ensures left/right neighbours have read the old V
    // before anyone in this block overwrites it.
    __syncthreads();

    if (inGrid) {
        States[13*idx] = Vbuf;
    }
}


// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads the next 13-float record from fp into init; exits if the file is short.
static void readInitRecord(FILE *fp, float *init) {
    if (fread(init, sizeof(float), 13, fp) != 13) {
        fprintf(stderr, "myinits.bin: could not read a full 13-float record\n");
        exit(EXIT_FAILURE);
    }
}

// Copies one 13-float record into every cell of rows [rowBegin,rowEnd) and
// columns [colBegin,colEnd) of the gridsize x gridsize host state array.
static void fillRegion(float *states, int gridsize,
                       int rowBegin, int rowEnd, int colBegin, int colEnd,
                       const float *init) {
    for (int i = rowBegin; i < rowEnd; i++) {
        for (int j = colBegin; j < colEnd; j++) {
            for (int k = 0; k < 13; k++) {
                states[13*(j+gridsize*i) + k] = init[k];
            }
        }
    }
}

// Initialises a 256 x 256 grid from four records in myinits.bin (one per
// quadrant), runs steptot updates on the GPU and appends the V field to the
// output file after every update except the first.
int main(){
    const int steptot = 20000;
    const int gridsize = 256;
    const int half = gridsize/2;
    const size_t cellCount = (size_t)gridsize*gridsize;

    const clock_t t1 = clock();

    // Device 1 is hard-coded; if it does not exist, keep going on the default
    // device and clear the recorded error so later checks do not report it.
    if (cudaSetDevice(1) != cudaSuccess) {
        fprintf(stderr, "warning: cudaSetDevice(1) failed; using the default device\n");
        cudaGetLastError();
    }

    FILE *States = fopen("C:\\Users\\ddwilson\\Desktop\\Isostable_Improvement\\foxstates.dat", "wb");
    if (States == NULL) {
        perror("foxstates.dat");
        return EXIT_FAILURE;
    }

    FILE *myinits = fopen("myinits.bin","rb");
    if (myinits == NULL) {
        perror("myinits.bin");
        fclose(States);
        return EXIT_FAILURE;
    }

    float *h_states = new float[13*cellCount];
    float *h_Vstates = new float[cellCount];

    // The file supplies one record per quadrant, in this order:
    // (top rows, left cols), (bottom, left), (bottom, right), (top, right).
    float h_myinits[13];
    readInitRecord(myinits, h_myinits);
    fillRegion(h_states, gridsize, 0, half, 0, half, h_myinits);
    readInitRecord(myinits, h_myinits);
    fillRegion(h_states, gridsize, half, gridsize, 0, half, h_myinits);
    readInitRecord(myinits, h_myinits);
    fillRegion(h_states, gridsize, half, gridsize, half, gridsize, h_myinits);
    readInitRecord(myinits, h_myinits);
    fillRegion(h_states, gridsize, 0, half, half, gridsize, h_myinits);
    fclose(myinits);

    float *d_states = NULL;
    float *d_Vstates = NULL;
    int *d_Simsize = NULL;
    int cudagrid[1];
    cudagrid[0] = gridsize;

    checkCuda(cudaMalloc( (void**) &d_states, 13*cellCount*sizeof(float) ), "cudaMalloc(d_states)");
    checkCuda(cudaMalloc( (void**) &d_Vstates, cellCount*sizeof(float) ), "cudaMalloc(d_Vstates)");
    checkCuda(cudaMalloc( (void**) &d_Simsize, sizeof(int) ), "cudaMalloc(d_Simsize)");

    checkCuda(cudaMemcpy(d_states,h_states,13*cellCount*sizeof(float),cudaMemcpyHostToDevice), "upload of states");
    checkCuda(cudaMemcpy(d_Simsize,cudagrid,sizeof(int),cudaMemcpyHostToDevice), "upload of grid size");

    int myco = 0;
    for(int m = 0;m<steptot;m++){

        Evaluate_Functions<<<gridsize,gridsize>>>(d_states,d_Simsize);
        checkCuda(cudaGetLastError(), "Evaluate_Functions launch");
        // Synchronise so that a fault inside the kernel is reported at the step
        // where it happened rather than at some later API call.
        checkCuda(cudaDeviceSynchronize(), "Evaluate_Functions execution");

        // myco is 0 only on the first iteration; afterwards it is reset to 0
        // and incremented back to 1 each time, so this runs on every later step.
        if (myco == 1){
            getvstate <<<gridsize,gridsize>>>(d_states, d_Vstates, d_Simsize);
            checkCuda(cudaGetLastError(), "getvstate launch");
            checkCuda(cudaMemcpy(h_Vstates,d_Vstates,cellCount*sizeof(float),cudaMemcpyDeviceToHost), "download of V");

            if (fwrite(h_Vstates,sizeof(float),cellCount,States) != cellCount) {
                perror("foxstates.dat");
                return EXIT_FAILURE;
            }
            myco =0;
        }

        myco++;
        printf("%d \n",m);
    }

    const clock_t t2 = clock();
    float diff  = ((float)t2-(float)t1)/CLOCKS_PER_SEC;
    printf("%f seconds \n",diff);

    checkCuda(cudaFree(d_states), "cudaFree(d_states)");
    checkCuda(cudaFree(d_Vstates), "cudaFree(d_Vstates)");
    checkCuda(cudaFree(d_Simsize), "cudaFree(d_Simsize)");
    delete[] h_states;
    delete[] h_Vstates;
    fclose(States);
    return 0;
}

1 个答案:

答案 0 :(得分:2)

提问者显然是针对默认的CUDA架构(计算能力1.0)进行编译的,而该架构除其他限制外,还不支持双精度。显然,改为针对正确的CUDA架构(计算能力2.0)进行编译后,问题得到了解决。

此社区维基回答是根据评论内容添加的,以便将该问题从未回答的问题队列中移除。