如何使用OpenACC积分微分方程?

时间:2019-07-06 02:54:19

标签: c++ openacc

我正在尝试使用OpenACC来计算具有各种初始条件的物理系统。我只有2个循环:第一个循环遍历各个初始条件,第二个循环对运动方程进行积分。我尝试了以下代码:

// Fragment from the question: the outer loop runs over nx initial conditions and is
// offloaded with OpenACC; the inner loop advances one trajectory for n0 steps.
// All variables used here are declared outside this fragment.
#pragma acc parallel loop copyout(X1,X2,Y1,Y2)

for(int j=0;j<nx;j++){ 

// Initial condition number j: x2 is spaced between x2_ini and x2_ini+|x2_ini-x2_fin|.
t= 0.;      

x2 = x2_ini+abs(x2_ini-x2_fin)*j/nx;     
y2= 0.;

px2 = 0.;
py2 = L/x2;


// x1 is derived from x2 through X1_neg; the remaining start values are zero.
x1 = X1_neg(x2);  
y1 = 0.;

px1= 0.;
py1 = 0.;

// Each step uses the state left by the previous one, so this loop is sequential.
#pragma acc loop seq

for(int i=0;i<n0;i++){      


    // Record the positions of trajectory j on every 5th step.
    if(i%5==0){

        X1[j][i/5]=x1;
        Y1[j][i/5]=y1;
        X2[j][i/5]=x2;
        Y2[j][i/5]=y2;

    }


    // Convert (x,y) to the (Q) variables; the helper is chosen by the sign of x.
    if(x1>=0){
        Qpos(Q1, Q2, x1, y1);
        }else
        Qneg(Q1, Q2, x1, y1);

    if(x2>=0){
        Qpos(Q3, Q4, x2, y2);
        }else
        Qneg(Q3, Q4, x2, y2);

    // Momenta conjugate to Q, plus the radii R1, R2 and the energy E.
    P1=2.*(px1*Q1+py1*Q2);
    P2=2.*(py1*Q1-px1*Q2);
    P3=2.*(px2*Q3+py2*Q4);
    P4=2.*(py2*Q3-px2*Q4);
    R(R1,Q1,Q2);
    R(R2,Q3,Q4);
    Energ(E,x1,x2,y1,y2,px1,px2,py1,py2); 

    // V1..V4 are the coefficients passed as "V" to Q_pred/V_pred below.
    V1=(Q3*Q3+Q4*Q4)*P1/4.;
    V2=(Q3*Q3+Q4*Q4)*P2/4.;
    V3=(Q1*Q1+Q2*Q2)*P3/4.;
    V4=(Q1*Q1+Q2*Q2)*P4/4.;

    r_12(r12,Q1,Q2,Q3,Q4);
    dr_12(dr12,Q1,Q2,Q3,Q4,V1,V2,V3,V4);

    // Higher-order coefficients A and J at the current state.

    A1=A_1(Q1,Q2,Q3,Q4,V1,V3,V4,r12,E);
    A2=A_2(Q1,Q2,Q3,Q4,V2,V3,V4,r12,E);
    A3=A_3(Q1,Q2,Q3,Q4,V1,V2,V3,r12,E);
    A4=A_4(Q1,Q2,Q3,Q4,V1,V2,V4,r12,E); 
    J1=J_1(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
    J2=J_2(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
    J3=J_3(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);
    J4=J_4(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);

    // Step size for this iteration.
    dTao(dtao,Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,A3,A4,J1,J2,J3,J4);

    // Predictor: provisional Q_ and V_ at the end of the step.
    Q1_=Q_pred(Q1,V1,A1,J1,dtao);
    Q2_=Q_pred(Q2,V2,A2,J2,dtao);
    Q3_=Q_pred(Q3,V3,A3,J3,dtao);
    Q4_=Q_pred(Q4,V4,A4,J4,dtao);       
    V1_=V_pred(V1,A1,J1,dtao);
    V2_=V_pred(V2,A2,J2,dtao);
    V3_=V_pred(V3,A3,J3,dtao);
    V4_=V_pred(V4,A4,J4,dtao);

    r_12(r12_,Q1_,Q2_,Q3_,Q4_);
    dr_12(dr12_,Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_);

    A1_=A_1(Q1_,Q2_,Q3_,Q4_,V1_,V3_,V4_,r12_,E);
    A2_=A_2(Q1_,Q2_,Q3_,Q4_,V2_,V3_,V4_,r12_,E);
    A3_=A_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,r12_,E);
    A4_=A_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V4_,r12_,E);    
    J1_=J_1(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
    J2_=J_2(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
    J3_=J_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);
    J4_=J_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);  

    // Corrector: combine start-of-step and predicted coefficients.


    V1_=V1+(A1_+A1)*dtao/2.-(J1_-J1)*dtao*dtao/12.;
    V2_=V2+(A2_+A2)*dtao/2.-(J2_-J2)*dtao*dtao/12.;
    V3_=V3+(A3_+A3)*dtao/2.-(J3_-J3)*dtao*dtao/12.;
    V4_=V4+(A4_+A4)*dtao/2.-(J4_-J4)*dtao*dtao/12.; 

    Q1_=Q1+(V1_+V1)*dtao/2.-(A1_-A1)*dtao*dtao/10.+(J1_+J1)*dtao*dtao*dtao/120.;
    Q2_=Q2+(V2_+V2)*dtao/2.-(A2_-A2)*dtao*dtao/10.+(J2_+J2)*dtao*dtao*dtao/120.;
    Q3_=Q3+(V3_+V3)*dtao/2.-(A3_-A3)*dtao*dtao/10.+(J3_+J3)*dtao*dtao*dtao/120.;   
    Q4_=Q4+(V4_+V4)*dtao/2.-(A4_-A4)*dtao*dtao/10.+(J4_+J4)*dtao*dtao*dtao/120.;

    // Accept the corrected values as the new state.
    Q1=Q1_;
    Q2=Q2_;
    Q3=Q3_;
    Q4=Q4_;
    V1=V1_;
    V2=V2_;
    V3=V3_;
    V4=V4_;

    P1=4.*V1/(Q3*Q3+Q4*Q4);
    P2=4.*V2/(Q3*Q3+Q4*Q4);
    P3=4.*V3/(Q1*Q1+Q2*Q2);
    P4=4.*V4/(Q1*Q1+Q2*Q2);

    // Advance t; R1 and R2 are still the values computed at the start of the step.
    dt=R1*R1*R2*R2*dtao;
    t=t+dt;

    // Back to positions and momenta for the next iteration.
    x1=Q1*Q1-Q2*Q2;
    x2=Q3*Q3-Q4*Q4;
    y1=2.*Q1*Q2;
    y2=2.*Q3*Q4;

    px1=(Q1*P1-Q2*P2)/(2.*(Q1*Q1+Q2*Q2));
    py1=(Q2*P1+Q1*P2)/(2.*(Q1*Q1+Q2*Q2));
    px2=(Q3*P3-Q4*P4)/(2.*(Q3*Q3+Q4*Q4));
    py2=(Q4*P3+Q3*P4)/(2.*(Q3*Q3+Q4*Q4));



    }



}

但是在使用pgc++ -acc code.cpp -Minfo=accel -ta=tesla:cuda9.2进行编译时,我收到很多错误,因此我怀疑自己的做法有问题。我需要的所有信息都存储在X和Y数组中。对于每组初始条件,我定义了新的变量 Q、V、A、J、P……它们大多通过函数更新;在第二个循环的每次迭代结束时,会得到下一时刻的 x1、y1、x2、y2 的新值,然后重复执行。

我需要更改什么才能编译代码?我还想知道是否还有其他可能的方法来提取信息(例如,指针或在文件中打印值),因为我不确定并行化时是否允许使用该方法,并且普通数组不能存储很多信息。

1 个答案:

答案 0 :(得分:1)

发生了一些问题。

首先,您无法在设备上执行字符串操作或文件 I/O(您最初贴出的代码片段中没有包含这部分)。您需要重新考虑如何将结果存储到文件中。目前,我只是在使用OpenACC时通过宏将其禁用。

第二,您使用了 vector。vector 可以使用,但存在问题:它们不是线程安全的,并且本质上是包含三个指针的类。data 子句执行的是浅拷贝,因此只会复制指针,而不会复制指针所指向的数据。您可以手动执行深拷贝(虽然比较棘手),或者使用CUDA统一内存(-ta=tesla:managed),但我直接把它们改成了常规分配的数组,因为您实际上并不需要它们是 vector。

最后,您的性能将非常糟糕。“nx”仅为10,因此代码远远没有充分利用该设备。您可能需要让 nx 达到成千上万的量级,才能看到GPU的真正优势。

此外,由于使用了大量局部变量,寄存器使用量很高。高寄存器使用量会导致占用率(occupancy)低,这通常会降低性能。不过,解决此问题的唯一方法是将大循环拆分成几个小循环,并将中间结果存储在全局数组中。但无论如何,您可能都需要执行类似的操作来处理文件 I/O 问题。

% cat pcc.cpp
#include<iostream>
#include<math.h>
#include<fstream>
#include<cstdlib>
#include<iomanip>
#include<ctime>
#include<vector>

//Compile using  pgc++ -acc pcc.cpp -Minfo=accel -ta=tesla:cuda9.2

using namespace std;
using RealVector = std::vector<double>;

// --- Predictor helpers: Taylor expansions in the step dtao ---
double Q_pred(double Q, double V, double A, double J, double dtao);
double V_pred(double V, double A, double J, double dtao);

// --- Second-order coefficients for Q1..Q4 ---
double A_1(double Q1, double Q2, double Q3, double Q4,
           double V1, double V3, double V4, double r12, double E);
double A_2(double Q1, double Q2, double Q3, double Q4,
           double V2, double V3, double V4, double r12, double E);
double A_3(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double r12, double E);
double A_4(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V4, double r12, double E);

// --- Third-order coefficients for Q1..Q4 ---
double J_1(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double V4,
           double A3, double A4, double r12, double dr12, double E);
double J_2(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double V4,
           double A3, double A4, double r12, double dr12, double E);
double J_3(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double V4,
           double A1, double A2, double r12, double dr12, double E);
double J_4(double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double V4,
           double A1, double A2, double r12, double dr12, double E);

// --- Coordinate transforms and scalar quantities (results returned by reference) ---
void Qpos(double& Qx, double& Qy, double x, double y);
void Qneg(double& Qx, double& Qy, double x, double y);
void R(double& Ri, double Qx, double Qy);
void Energ(double& E, double x1, double x2, double y1, double y2,
           double px1, double px2, double py1, double py2);
void r_12(double& r12, double Q1, double Q2, double Q3, double Q4);
void dr_12(double& dr12, double Q1, double Q2, double Q3, double Q4,
           double V1, double V2, double V3, double V4);
void dTao(double& dtao,
          double Q1, double Q2, double Q3, double Q4,
          double V1, double V2, double V3, double V4,
          double A1, double A2, double A3, double A4,
          double J1, double J2, double J3, double J4);

// --- Initial x1 as a function of the initial x2 and L (two roots) ---
double X1_pos(double x2, double L);
double X1_neg(double x2, double L);


/**
 * Integrates nx trajectories, one per initial value of x2, for n0 steps each.
 *
 * The outer loop over initial conditions is offloaded with OpenACC; the inner
 * time-stepping loop is sequential because every step depends on the previous one.
 * Every fac-th step the time and positions are sampled into the T/X1/Y1/X2/Y2
 * arrays. Each trajectory j owns its own slice [j*N, (j+1)*N) of those arrays so
 * that parallel iterations never write to the same element.
 *
 * File output uses strings and streams, which cannot run on the device, so it is
 * compiled only when _OPENACC is not defined. In an OpenACC build the sampled data
 * of all trajectories is available in the host arrays after the parallel region.
 *
 * @return 0 on completion.
 */
int main(){

        int n0,nx,N,fac;
        double x1,x2,y1,y2,x2_ini,x2_fin;
        double px1,px2,py1,py2;
        double Q1,Q2,Q3,Q4;
        double V1,V2,V3,V4;
        double A1,A2,A3,A4;
        double J1,J2,J3,J4;
        double Q1_,Q2_,Q3_,Q4_;
        double V1_,V2_,V3_,V4_;
        double A1_,A2_,A3_,A4_;
        double J1_,J2_,J3_,J4_;
        double P1,P2,P3,P4;
        double dtao,R1,R2,r12,dr12,r12_,dr12_;
        double dt,t,E,L;
        ofstream electron1,electron2;
        string x20;

        nx = 10;          // number of initial conditions (outer, parallel loop)
        n0 = 2097152;     // integration steps per trajectory
        N = 262144;       // stored samples per trajectory
        fac=n0/N;         // a sample is stored every fac steps

        x2_ini = 7.824;
        x2_fin = 10.;
        L = 0.28284271247461900976;

        // Plain arrays rather than std::vector: OpenACC data clauses copy the
        // pointed-to range directly. Storage is nx*N doubles per array (one
        // N-sample slice per trajectory); std::size_t avoids int overflow if nx grows.
        const std::size_t total = static_cast<std::size_t>(nx)*N;
        double *X1 = new double[total];
        double *Y1 = new double[total];
        double *X2 = new double[total];
        double *Y2 = new double[total];
        double *T = new double[total];

#pragma acc parallel loop copyout(X1[:total],X2[:total],Y1[:total],Y2[:total],T[:total])
        for(int j=0;j<nx;j++){
                // Start of this trajectory's slice in the output arrays.
                const std::size_t base = static_cast<std::size_t>(j)*N;

                t= 0.;
                // x2 runs from x2_ini (j=0) to x2_ini+|x2_ini-x2_fin| (j=nx-1).
                x2 = x2_ini+abs(x2_ini-x2_fin)*j/(nx-1);
                y2= 0.;

                px2 = 0.;
                py2 = L/x2;

                x1 = X1_neg(x2,L);
                y1 = 0.;
                px1= 0.;
                py1 = 0.;
#ifndef _OPENACC
                // The initial x2 labels the output files (host-only: string operation).
                x20=to_string(x2);
#endif

#pragma acc loop seq
                for(int i=0;i<n0;i++){
                        if(i%fac==0){
                                const std::size_t k = base+i/fac;
                                T[k]=t;
                                X1[k]=x1;
                                Y1[k]=y1;
                                X2[k]=x2;
                                Y2[k]=y2;
                        }

                        // Convert (x,y) to the Q variables; helper chosen by the sign of x.
                        if(x1>=0){
                                Qpos(Q1, Q2, x1, y1);
                        }else
                                Qneg(Q1, Q2, x1, y1);

                        if(x2>=0){
                                Qpos(Q3, Q4, x2, y2);
                        }else
                                Qneg(Q3, Q4, x2, y2);

                        P1=2.*(px1*Q1+py1*Q2);
                        P2=2.*(py1*Q1-px1*Q2);
                        P3=2.*(px2*Q3+py2*Q4);
                        P4=2.*(py2*Q3-px2*Q4);
                        R(R1,Q1,Q2);
                        R(R2,Q3,Q4);
                        Energ(E,x1,x2,y1,y2,px1,px2,py1,py2);

                        V1=(Q3*Q3+Q4*Q4)*P1/4.;
                        V2=(Q3*Q3+Q4*Q4)*P2/4.;
                        V3=(Q1*Q1+Q2*Q2)*P3/4.;
                        V4=(Q1*Q1+Q2*Q2)*P4/4.;

                        r_12(r12,Q1,Q2,Q3,Q4);
                        dr_12(dr12,Q1,Q2,Q3,Q4,V1,V2,V3,V4);

                        // Higher-order coefficients at the current state.
                        A1=A_1(Q1,Q2,Q3,Q4,V1,V3,V4,r12,E);
                        A2=A_2(Q1,Q2,Q3,Q4,V2,V3,V4,r12,E);
                        A3=A_3(Q1,Q2,Q3,Q4,V1,V2,V3,r12,E);
                        A4=A_4(Q1,Q2,Q3,Q4,V1,V2,V4,r12,E);
                        J1=J_1(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
                        J2=J_2(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
                        J3=J_3(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);
                        J4=J_4(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);

                        // Step size for this iteration.
                        dTao(dtao,Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,A3,A4,J1,J2,J3,J4);

                        // Predictor.
                        Q1_=Q_pred(Q1,V1,A1,J1,dtao);
                        Q2_=Q_pred(Q2,V2,A2,J2,dtao);
                        Q3_=Q_pred(Q3,V3,A3,J3,dtao);
                        Q4_=Q_pred(Q4,V4,A4,J4,dtao);
                        V1_=V_pred(V1,A1,J1,dtao);
                        V2_=V_pred(V2,A2,J2,dtao);
                        V3_=V_pred(V3,A3,J3,dtao);
                        V4_=V_pred(V4,A4,J4,dtao);

                        r_12(r12_,Q1_,Q2_,Q3_,Q4_);
                        dr_12(dr12_,Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_);

                        A1_=A_1(Q1_,Q2_,Q3_,Q4_,V1_,V3_,V4_,r12_,E);
                        A2_=A_2(Q1_,Q2_,Q3_,Q4_,V2_,V3_,V4_,r12_,E);
                        A3_=A_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,r12_,E);
                        A4_=A_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V4_,r12_,E);
                        J1_=J_1(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
                        J2_=J_2(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
                        J3_=J_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);
                        J4_=J_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);

                        // Corrector: combine start-of-step and predicted coefficients.
                        V1_=V1+(A1_+A1)*dtao/2.-(J1_-J1)*dtao*dtao/12.;
                        V2_=V2+(A2_+A2)*dtao/2.-(J2_-J2)*dtao*dtao/12.;
                        V3_=V3+(A3_+A3)*dtao/2.-(J3_-J3)*dtao*dtao/12.;
                        V4_=V4+(A4_+A4)*dtao/2.-(J4_-J4)*dtao*dtao/12.;

                        Q1_=Q1+(V1_+V1)*dtao/2.-(A1_-A1)*dtao*dtao/10.+(J1_+J1)*dtao*dtao*dtao/120.;
                        Q2_=Q2+(V2_+V2)*dtao/2.-(A2_-A2)*dtao*dtao/10.+(J2_+J2)*dtao*dtao*dtao/120.;
                        Q3_=Q3+(V3_+V3)*dtao/2.-(A3_-A3)*dtao*dtao/10.+(J3_+J3)*dtao*dtao*dtao/120.;
                        Q4_=Q4+(V4_+V4)*dtao/2.-(A4_-A4)*dtao*dtao/10.+(J4_+J4)*dtao*dtao*dtao/120.;

                        Q1=Q1_;
                        Q2=Q2_;
                        Q3=Q3_;
                        Q4=Q4_;
                        V1=V1_;
                        V2=V2_;
                        V3=V3_;
                        V4=V4_;

                        P1=4.*V1/(Q3*Q3+Q4*Q4);
                        P2=4.*V2/(Q3*Q3+Q4*Q4);
                        P3=4.*V3/(Q1*Q1+Q2*Q2);
                        P4=4.*V4/(Q1*Q1+Q2*Q2);

                        // R1 and R2 are still the values from the start of the step.
                        dt=R1*R1*R2*R2*dtao;
                        t=t+dt;

                        // Back to positions and momenta for the next iteration.
                        x1=Q1*Q1-Q2*Q2;
                        x2=Q3*Q3-Q4*Q4;
                        y1=2.*Q1*Q2;
                        y2=2.*Q3*Q4;

                        px1=(Q1*P1-Q2*P2)/(2.*(Q1*Q1+Q2*Q2));
                        py1=(Q2*P1+Q1*P2)/(2.*(Q1*Q1+Q2*Q2));
                        px2=(Q3*P3-Q4*P4)/(2.*(Q3*Q3+Q4*Q4));
                        py2=(Q4*P3+Q3*P4)/(2.*(Q3*Q3+Q4*Q4));

                }

                dt=t/(N-1);

#ifndef _OPENACC
                electron1.open(("electron1_neg_x20_"+x20+".dat").c_str());
                electron2.open(("electron2_neg_x20_"+x20+".dat").c_str());

                // NOTE(review): the time column is the uniform grid i*dt, while the samples
                // were taken at the times stored in T — confirm this is intended.
                for(int i=0; i<N ;i++){
                        const double ti=i*dt;
                        // '\n' instead of endl: no flush per line; close() flushes once.
                        electron1<<ti<<" "<<X1[base+i]<<" "<<Y1[base+i]<<'\n';
                        electron2<<ti<<" "<<X2[base+i]<<" "<<Y2[base+i]<<'\n';
                }

                electron1.close();
                electron2.close();
#endif

        }

        delete [] X1;
        delete [] X2;
        delete [] Y1;
        delete [] Y2;
        delete [] T;

        return 0;

}

// Predicted coordinate after a step dtao: third-order Taylor expansion
// Q + V*dtao + A*dtao^2/2 + J*dtao^3/6.
double Q_pred(double Q, double V, double A, double J, double dtao){
        const double firstOrder  = V*dtao;
        const double secondOrder = A*dtao*dtao/2.;
        const double thirdOrder  = J*dtao*dtao*dtao/6.;
        return Q+firstOrder+secondOrder+thirdOrder;
}

// Predicted first-order coefficient after a step dtao: second-order Taylor
// expansion V + A*dtao + J*dtao^2/2.
double V_pred(double V, double A, double J, double dtao){
        const double firstOrder  = A*dtao;
        const double secondOrder = J*dtao*dtao/2.;
        return V+firstOrder+secondOrder;
}

// Second-order coefficient for Q1: main() passes the result as the `A` argument
// of Q_pred/V_pred for the first coordinate.
// Inputs: Q1..Q4, the first-order coefficients V1, V3, V4 (V2 is not used here),
// r12 as produced by r_12(), and E as produced by Energ().
// NOTE(review): divides by (Q3*Q3+Q4*Q4), (Q1*Q1+Q2*Q2)^2 and r12^3 without a zero guard.
double A_1(double Q1, double Q2, double Q3, double Q4, double V1, double V3, double V4,double r12, double E){
        return 2.*(Q3*V3+Q4*V4)*V1/(Q3*Q3+Q4*Q4)+(Q3*Q3+Q4*Q4)*(-(4.*Q1*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)-4.*Q1+2.*Q1*(Q3*Q3+Q4*Q4)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q1+(Q4*Q4-Q3*Q3)*Q1-2.*Q2*Q3*Q4)/pow(r12,3.)))/4.;
}

// Second-order coefficient for Q2: main() passes the result as the `A` argument
// of Q_pred/V_pred for the second coordinate.
// Inputs: Q1..Q4, the first-order coefficients V2, V3, V4 (V1 is not used here),
// r12 as produced by r_12(), and E as produced by Energ().
// NOTE(review): divides by (Q3*Q3+Q4*Q4), (Q1*Q1+Q2*Q2)^2 and r12^3 without a zero guard.
double A_2(double Q1, double Q2, double Q3, double Q4, double V2, double V3, double V4,double r12, double E){
        return 2.*(Q3*V3+Q4*V4)*V2/(Q3*Q3+Q4*Q4)+(Q3*Q3+Q4*Q4)*(-4.*Q2*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)+4.*Q2-2.*Q2*(Q3*Q3+Q4*Q4)*(-E+1./r12)+2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q2-(Q4*Q4-Q3*Q3)*Q2-2.*Q1*Q3*Q4)/pow(r12,3.))/4.;
}

// Second-order coefficient for Q3: main() passes the result as the `A` argument
// of Q_pred/V_pred for the third coordinate.
// Inputs: Q1..Q4, the first-order coefficients V1, V2, V3 (V4 is not used here),
// r12 as produced by r_12(), and E as produced by Energ().
// NOTE(review): divides by (Q1*Q1+Q2*Q2), (Q3*Q3+Q4*Q4)^2 and r12^3 without a zero guard.
double A_3(double Q1, double Q2, double Q3, double Q4, double V1, double V2, double V3,double r12, double E){
        return 2.*(Q1*V1+Q2*V2)*V3/(Q1*Q1+Q2*Q2)+(Q1*Q1+Q2*Q2)*(-4.*Q3*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)+4.*Q3-2.*Q3*(Q1*Q1+Q2*Q2)*(-E+1./r12)+2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q3+(Q2*Q2-Q1*Q1)*Q3-2.*Q1*Q2*Q4)/pow(r12,3.))/4.;
}

// Second-order coefficient for Q4: main() passes the result as the `A` argument
// of Q_pred/V_pred for the fourth coordinate.
// Inputs: Q1..Q4, the first-order coefficients V1, V2, V4 (V3 is not used here),
// r12 as produced by r_12(), and E as produced by Energ().
// NOTE(review): divides by (Q1*Q1+Q2*Q2), (Q3*Q3+Q4*Q4)^2 and r12^3 without a zero guard.
double A_4(double Q1, double Q2, double Q3, double Q4, double V1, double V2, double V4,double r12, double E){
        return 2.*(Q1*V1+Q2*V2)*V4/(Q1*Q1+Q2*Q2)+(Q1*Q1+Q2*Q2)*(-4.*Q4*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)+4.*Q4-2.*Q4*(Q1*Q1+Q2*Q2)*(-E+1./r12)+2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q4-(Q2*Q2-Q1*Q1)*Q4-2.*Q1*Q2*Q3)/pow(r12,3.))/4.;
}

// Third-order coefficient for Q1: main() passes the result as the `J` argument
// of Q_pred/V_pred for the first coordinate.
// Inputs: Q1..Q4, V1..V4, the second-order coefficients A3 and A4, r12 and dr12
// as produced by r_12()/dr_12(), and E as produced by Energ().
// NOTE(review): presumably the derivative of A_1 along the trajectory — verify against the derivation.
double J_1(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A3, double A4, double r12,double dr12 ,double E){
        return 2.*(V3*V3+V4*V4+Q3*A3+Q4*A4)*V1/(Q3*Q3+Q4*Q4)+(Q3*V3+Q4*V4)*(-(4.*Q1*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)-4.*Q1+2.*Q1*(Q3*Q3+Q4*Q4)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q1+(Q4*Q4-Q3*Q3)*Q1-2.*Q2*Q3*Q4)/pow(r12,3.)))+(Q3*Q3+Q4*Q4)*(4 *(V1 - ((V3*V3 + V4*V4) *V1)/pow(Q1*Q1 + Q2*Q2,2.) + (4 *Q1 *(V3*V3 + V4*V4) *(Q1 *V1 + Q2 *V2))/pow(Q1*Q1 + Q2*Q2,3.) - (2 *Q1 *(V3 *A3 + V4 *A4))/pow(Q1*Q1 + Q2*Q2,2.))+(1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(Q3*Q3 *V1 +2 *Q1 *Q3 *V3 + Q4 *(Q4 *V1 + 2 *Q1 *V4)) + 2 *Q1 *(Q3*Q3 + Q4*Q4)* dr12)+ (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4)) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4)) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(3 *Q1*Q1 *V1 + Q2*Q2 *V1 + (-Q3*Q3 + Q4*Q4) *V1 - 2 *Q3 *Q4 *V2 - 2 *Q2 *(Q4 *V3 + Q3 *V4) + 2 *Q1 *(Q2 *V2 - Q3 *V3 + Q4 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4))* dr12))/4.;
}

// Third-order coefficient for Q2: main() passes the result as the `J` argument
// of Q_pred/V_pred for the second coordinate.
// Inputs: Q1..Q4, V1..V4, the second-order coefficients A3 and A4, r12 and dr12
// as produced by r_12()/dr_12(), and E as produced by Energ().
// NOTE(review): presumably the derivative of A_2 along the trajectory — verify against the derivation.
double J_2(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A3, double A4, double r12,double dr12 , double E){
        return 2.*(V3*V3+V4*V4+Q3*A3+Q4*A4)*V2/(Q3*Q3+Q4*Q4)+(Q3*V3+Q4*V4)*(-(4.*Q2*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)-4.*Q2+2.*Q2*(Q3*Q3+Q4*Q4)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q2-(Q4*Q4-Q3*Q3)*Q2-2.*Q1*Q3*Q4)/pow(r12,3.)))+(Q3*Q3+Q4*Q4)*(4 *(V2 - ((V3*V3 + V4*V4) *V2)/pow(Q1*Q1 + Q2*Q2,2.) + (4 *Q2 *(V3*V3 + V4*V4) *(Q1 *V1 + Q2 *V2))/pow(Q1*Q1 + Q2*Q2,3.) - ( 2 *Q2 *(V3 *A3 + V4 *A4))/pow(Q1*Q1 + Q2*Q2,2.))+(1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(Q3*Q3 *V2 + 2 *Q2 *Q3 *V3 + Q4 *(Q4 *V2 + 2 *Q2 *V4)) + 2 *Q2 *(Q3*Q3 + Q4*Q4) *dr12) + (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) - 2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) -2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(Q1*Q1 *V2 + Q3*Q3 *V2 + (3 *Q2*Q2 - Q4*Q4) *V2 +Q3 *(-2 *Q4 *V1 + 2 *Q2 *V3) -2 *Q2 *Q4 *V4 + 2 *Q1 *(Q2 *V1 - Q4 *V3 - Q3 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) - 2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4)* dr12))/4.;
}

// Third-order coefficient for Q3: main() passes the result as the `J` argument
// of Q_pred/V_pred for the third coordinate.
// Inputs: Q1..Q4, V1..V4, the second-order coefficients A1 and A2, r12 and dr12
// as produced by r_12()/dr_12(), and E as produced by Energ().
// NOTE(review): presumably the derivative of A_3 along the trajectory — verify against the derivation.
double J_3(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A1, double A2, double r12,double dr12 , double E){
        return 2.*(V1*V1+V2*V2+Q1*A1+Q2*A2)*V3/(Q1*Q1+Q2*Q2)+(Q1*V1+Q2*V2)*(-(4.*Q3*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)-4.*Q3+2.*Q3*(Q1*Q1+Q2*Q2)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q3+(Q2*Q2-Q1*Q1)*Q3-2.*Q1*Q2*Q4)/pow(r12,3.)))+(Q1*Q1+Q2*Q2)*(4 *(V3 - ((V1*V1 + V2*V2) *V3)/pow(Q3*Q3 + Q4*Q4,2.) + (4 *Q3 *(V1*V1 + V2*V2) *(Q3 *V3 + Q4 *V4))/pow(Q3*Q3 + Q4*Q4,3.) - (2 *Q3 *(V1 *A1 + V2 *A2))/pow(Q3*Q3 + Q4*Q4,2.))+  (1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(2 *Q3 *(Q1 *V1 + Q2 *V2) + (Q1*Q1 + Q2*Q2) *V3) + 2 *(Q1*Q1 + Q2*Q2) *Q3 *dr12) + (1./(pow(r12,4.)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(Q2 *(-2 *Q4 *V1 + 2 *Q3 *V2) - Q1*Q1 *V3 + Q2*Q2 *V3 + (3 *Q3*Q3 + Q4*Q4) *V3 + 2 *Q3 *Q4 *V4 - 2 *Q1 *(Q3 *V1 + Q4 *V2 + Q2 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *dr12))/4.;
}

// Third-order coefficient for Q4: main() passes the result as the `J` argument
// of Q_pred/V_pred for the fourth coordinate.
// Inputs: Q1..Q4, V1..V4, the second-order coefficients A1 and A2, r12 and dr12
// as produced by r_12()/dr_12(), and E as produced by Energ().
// NOTE(review): presumably the derivative of A_4 along the trajectory — verify against the derivation.
// NOTE(review): the (1./(r12*r12)) term enters with the opposite sign to the matching term in J_3 — confirm.
double J_4(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A1, double A2, double r12,double dr12 , double E){
        return 2.*(V1*V1+V2*V2+Q1*A1+Q2*A2)*V4/(Q1*Q1+Q2*Q2)+(Q1*V1+Q2*V2)*(-(4.*Q4*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)-4.*Q4+2.*Q4*(Q1*Q1+Q2*Q2)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q4-(Q2*Q2-Q1*Q1)*Q4-2.*Q1*Q2*Q3)/pow(r12,3.)))+(Q1*Q1+Q2*Q2)*(4 *(V4 - ((V1*V1 + V2*V2) *V4)/pow(Q3*Q3 + Q4*Q4,2.) + (4 *Q4 *(V1*V1 + V2*V2) *(Q3 *V3 + Q4 *V4))/pow(Q3*Q3 + Q4*Q4,3.) - ( 2 *Q4 *(V1 *A1 + V2 *A2))/pow(Q3*Q3 + Q4*Q4,2.))+ (1./(r12*r12))*(-2 *r12 *(-1 + E* r12) *(2 *Q4 *(Q1 *V1 + Q2 *V2) + (Q1*Q1 + Q2*Q2) *V4) - 2 *(Q1*Q1 + Q2*Q2) *Q4 *dr12) + (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(2 *Q1 *(Q4 *V1 -Q3 *V2) + 2 *Q3 *Q4 *V3 - 2 *Q2 *(Q3 *V1 +Q4 *V2 + Q1 *V3) + (Q1*Q1 - Q2*Q2 + Q3*Q3 + 3 *Q4*Q4) *V4) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *dr12))/4.;
}



// Maps a point (x, y) to (Qx, Qy) such that x = Qx*Qx - Qy*Qy and y = 2*Qx*Qy,
// using the branch Qx = sqrt((|r| + x)/2). main() calls this for x >= 0.
// Outputs are written through Qx and Qy.
// Note: for x = 0 and y = 0 the divisor is zero and Qy is 0/0 (NaN).
void Qpos(double& Qx, double& Qy, double x, double y){
        // The same square root feeds both outputs; compute it once with sqrt.
        const double q = sqrt(0.5*(sqrt(x*x+y*y)+x));
        Qx=q;
        Qy=y/(2.*q);
}

// Maps a point (x, y) to (Qx, Qy) such that x = Qx*Qx - Qy*Qy and y = 2*Qx*Qy,
// using the branch Qy = sqrt((|r| - x)/2). main() calls this for x < 0.
// Outputs are written through Qx and Qy.
// Note: for x = 0 and y = 0 the divisor is zero and Qx is 0/0 (NaN).
void Qneg(double& Qx, double& Qy, double x, double y){
        // The same square root feeds both outputs; compute it once with sqrt.
        const double q = sqrt(0.5*(sqrt(x*x+y*y)-x));
        Qy=q;
        Qx=y/(2.*q);
}

// Writes the Euclidean norm of (Qx, Qy) into Ri.
void R(double& Ri, double Qx, double Qy){
        // sqrt instead of the general-purpose pow(...,0.5).
        Ri=sqrt(Qx*Qx+Qy*Qy);
}
// Writes into E the sum of: the kinetic terms (p^2)/2 of both particles,
// -2/|r1| and -2/|r2| for the distances from the origin, and +1/|r1-r2|.
// Positions are (x1,y1), (x2,y2); momenta are (px1,py1), (px2,py2).
// Note: no guard against a particle at the origin or coinciding particles.
void Energ(double& E, double x1, double x2, double y1, double y2, double px1, double px2, double py1, double py2){
        const double kinetic1 = (px1*px1+py1*py1)/2.;
        const double kinetic2 = (px2*px2+py2*py2)/2.;
        // sqrt and plain products replace pow(...,0.5) and pow(...,2.).
        const double dist1 = sqrt(x1*x1+y1*y1);
        const double dist2 = sqrt(x2*x2+y2*y2);
        const double dx = x1-x2;
        const double dy = y1-y2;
        const double dist12 = sqrt(dx*dx+dy*dy);
        E=kinetic1+kinetic2-2./dist1-2./dist2+1./dist12;
}



// Writes into r12 the separation of the two particles expressed in Q variables:
// sqrt(|Qa|^4 + |Qb|^4 - 2*(Qa.Qb)^2 + 2*(Qa x Qb)^2),
// with Qa = (Q1,Q2) and Qb = (Q3,Q4).
void r_12(double& r12,double Q1, double Q2, double Q3, double Q4){

        const double normA = Q1*Q1+Q2*Q2;
        const double normB = Q3*Q3+Q4*Q4;
        const double dot = Q1*Q3+Q2*Q4;
        const double cross = Q1*Q4-Q2*Q3;

        // sqrt and plain products replace pow(...,0.5) and pow(...,2.).
        r12=sqrt(normA*normA+normB*normB-2.*dot*dot+2.*cross*cross);
}

// Writes into dr12 the rate of change of the separation computed by r_12(),
// given the first-order coefficients V1..V4 of Q1..Q4. The divisor is the same
// expression r_12() evaluates, so dr12 is undefined when the separation is zero.
void dr_12(double& dr12,double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4){

        const double normA = Q1*Q1 + Q2*Q2;
        const double normB = Q3*Q3 + Q4*Q4;
        const double cross = Q2*Q3 - Q1*Q4;
        const double dot = Q1*Q3 + Q2*Q4;

        const double numerator = 2.*(normA*(Q1*V1 + Q2*V2)
                                   + cross*(-Q4*V1 + Q3*V2 + Q2*V3 - Q1*V4)
                                   - dot*(Q3*V1 + Q4*V2 + Q1*V3 + Q2*V4)
                                   + normB*(Q3*V3 + Q4*V4));

        // sqrt and plain products replace pow(...,0.5) and pow(...,2.).
        dr12=numerator/sqrt(normA*normA + 2.*cross*cross - 2.*dot*dot + normB*normB);
}

// Writes the step size into dtao:
//   dtao = sqrt(eta * (|A|^2*|Q|^2 + |V|^4) / (|J|^2*|V|^2 + |A|^4)),  eta = 1e-7,
// where |.|^2 is the sum of squares of the four components.
// Note: the result is NaN/inf when the divisor is zero (e.g. all A and J are zero).
void dTao(double& dtao,double Q1,double Q2,double Q3,double Q4,double V1,double V2,double V3,double V4,double A1,double A2,double A3,double A4,double J1,double J2,double J3,double J4){

        // Scale factor of the step-size formula (same value as the original literal).
        const double eta=0.0000001;

        // Squared norms of the four-component vectors.
        const double modQ=Q1*Q1+Q2*Q2+Q3*Q3+Q4*Q4;
        const double modV=V1*V1+V2*V2+V3*V3+V4*V4;
        const double modA=A1*A1+A2*A2+A3*A3+A4*A4;
        const double modJ=J1*J1+J2*J2+J3*J3+J4*J4;

        // sqrt instead of the general-purpose pow(...,0.5).
        dtao=sqrt(eta*(modA*modQ+modV*modV)/(modJ*modV+modA*modA));
}

// Returns the initial x1 for a given initial x2 and parameter L, taking the
// root with the "+" sign in front of the square root (X1_neg takes the "-" root).
// Note: the result is NaN when the expression under the root is negative,
// and x2 must be non-zero.
double X1_pos(double x2, double L){

        const double discriminant = L*L*L*L - 20.*L*L*x2 + 68.*x2*x2 + 4.*L*L *x2*x2 - 40.*x2*x2*x2 + 4. *x2*x2*x2*x2;
        const double denominator = 2.*(2.+ L*L/(x2*x2) - 4./x2);

        // sqrt instead of the general-purpose pow(...,0.5).
        return (-2. + L*L/x2 + 2.*x2 + sqrt(discriminant)/x2)/denominator;

}

// Returns the initial x1 for a given initial x2 and parameter L, taking the
// root with the "-" sign in front of the square root (X1_pos takes the "+" root).
// main() uses this to set the initial x1 of each trajectory.
// Note: the result is NaN when the expression under the root is negative,
// and x2 must be non-zero.
double X1_neg(double x2, double L){

        const double discriminant = L*L*L*L - 20.*L*L*x2 + 68.*x2*x2 + 4.*L*L *x2*x2 - 40.*x2*x2*x2 + 4. *x2*x2*x2*x2;
        const double denominator = 2.*(2.+ L*L/(x2*x2) - 4./x2);

        // sqrt instead of the general-purpose pow(...,0.5).
        return (-2. + L*L/x2 + 2.*x2 - sqrt(discriminant)/x2)/denominator;

}

% pgc++ -ta=tesla -Minfo=accel pcc.cpp -o pcc.exe ; ./pcc.exe
main:
     74, Generating copyout(T_[:N],X1[:N],X2[:N],Y2[:N],Y1[:N],T[:N])
         Generating Tesla code
         77, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
         94, #pragma acc loop seq
     94, Complex loop carried dependence of T->,X1-> prevents parallelization
         Loop carried dependence of T-> prevents parallelization
         Loop carried backward dependence of T-> prevents vectorization
         Complex loop carried dependence of Y1-> prevents parallelization
         Loop carried dependence of X2-> prevents parallelization
         Loop carried backward dependence of X2-> prevents vectorization
         Complex loop carried dependence of X2-> prevents parallelization
         Loop carried dependence of X1-> prevents parallelization
         Loop carried backward dependence of X1-> prevents vectorization
         Loop carried dependence of Y2->,Y1-> prevents parallelization
         Loop carried backward dependence of Y2->,Y1-> prevents vectorization
         Complex loop carried dependence of Y2-> prevents parallelization
         Loop carried scalar dependence for y2 at line 111
         Loop carried scalar dependence for py1 at line 113
         Loop carried scalar dependence for py2 at line 116
         Loop carried scalar dependence for Q4 at line 122
         Loop carried scalar dependence for Q2 at line 123,124
         Loop carried scalar dependence for Q4 at line 126,127,131,132,133,134,135,136,137,138,140,145
         Loop carried scalar dependence for Q1 at line 113,114,117,123,124,126,127,131,132,133,134,135,136,137,138,140,142,171
         Loop carried scalar dependence for py1 at line 114
         Loop carried scalar dependence for py2 at line 115
         Loop carried scalar dependence for Q2 at line 117
         Loop carried scalar dependence for y2 at line 119
         Loop carried scalar dependence for Q4 at line 121
         Loop carried scalar dependence for Q2 at line 126,127,131,132,133,134,135,136,137,138,140,143,172
         Loop carried scalar dependence for Q3 at line 115,116,118,121,122,126,127,131,132,133,134,135,136,137,138,140,144
         Loop carried scalar dependence for Q4 at line 174
         Loop carried scalar dependence for Q3 at line 173
         Loop carried scalar dependence for t at line 96
         Loop carried scalar dependence for y2 at line 100
         Loop carried scalar dependence for x1 at line 103
         Loop carried scalar dependence for y1 at line 104
         Loop carried scalar dependence for py2 at line 119
         Loop carried scalar dependence for y1 at line 98,106
         Loop carried scalar dependence for x2 at line 108
         Loop carried scalar dependence for y2 at line 109
         Loop carried scalar dependence for x1 at line 119,97,104,106
         Loop carried scalar dependence for x2 at line 119,99,109,111
         Loop carried scalar dependence for y1 at line 119
         Loop carried scalar dependence for Q2 at line 113,114
         Loop carried scalar dependence for Q4 at line 118
         Loop carried scalar dependence for px1 at line 113,114
         Loop carried scalar dependence for px2 at line 119
         Loop carried scalar dependence for Q4 at line 115,116
         Loop carried scalar dependence for px1 at line 119
         Loop carried scalar dependence for px2 at line 115,116
         Loop carried scalar dependence for py1 at line 119
Q_pred(double, double, double, double, double):
    240, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
V_pred(double, double, double, double):
    244, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
A_1(double, double, double, double, double, double, double, double, double):
    248, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
A_2(double, double, double, double, double, double, double, double, double):
    252, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
A_3(double, double, double, double, double, double, double, double, double):
    256, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
A_4(double, double, double, double, double, double, double, double, double):
    260, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
J_1(double, double, double, double, double, double, double, double, double, double, double, double, double):
    264, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
J_2(double, double, double, double, double, double, double, double, double, double, double, double, double):
    268, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
J_3(double, double, double, double, double, double, double, double, double, double, double, double, double):
    272, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
J_4(double, double, double, double, double, double, double, double, double, double, double, double, double):
    276, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
Qpos(double &, double &, double, double):
    282, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
Qneg(double &, double &, double, double):
    288, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
R(double &, double, double):
    294, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
Energ(double &, double, double, double, double, double, double, double, double):
    299, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
r_12(double &, double, double, double, double):
    306, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
dr_12(double &, double, double, double, double, double, double, double, double):
    313, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
dTao(double &, double, double, double, double, double, double, double, double, double, double, double, double, double, double, double, double):
    320, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
X1_neg(double, double):
    338, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
std::abs(double):
      1, include "iostream"
          35, include "iostream"
                4, include "ostream"
                    38, include "ios"
                         42, include "ios_base.h"
                              41, include "locale_classes.h"
                                   40, include "string"
                                        52, include "basic_string.h"
                                           6391, include "string_conversions.h"
                                                  41, include "cstdlib"
                                                       77, include "std_abs.h"
                                                            71, Generating implicit acc routine seq
                                                                Generating acc routine seq
                                                                Generating Tesla code
__gnu_cxx::__promote_2<T1, T2, __gnu_cxx::__promote<T1, std::__is_integer<T1>::__value>::__type, __gnu_cxx::__promote<T2, std::__is_integer<T2>::__value>::__type>::__type std::pow<double, int>(T1, T2):
      2, include "math.h"
          59, include "math.h"
               36, include "cmath"
                   416, Generating implicit acc routine seq
                        Generating acc routine seq
                        Generating Tesla code

Accelerator Kernel Timing data
pcc.cpp
  main  NVIDIA  devicenum=0
    time(us): 35,476,363
    74: compute region reached 1 time
        74: kernel launched 1 time
            grid: [1]  block: [10]
             device time(us): total=35,475,345 max=35,475,345 min=35,475,345 avg=35,475,345
            elapsed time(us): total=35,475,684 max=35,475,684 min=35,475,684 avg=35,475,684
    74: data region reached 2 times
        227: data copyout transfers: 6
             device time(us): total=1,018 max=186 min=166 avg=169