Why is my parallel MPI code much slower than the serial code?

Date: 2019-06-27 11:10:01

Tags: parallel-processing mpi

I know this is not the first time this question has been asked, but I am really confused. I am new to MPI and I am trying to implement a Jacobi solver for the linear system Ax = b. I want to compare the running time of the parallel and serial versions, but for large N my MPI code is much slower, and I do not know whether the problem is in my code or in my computer. By the way, I am running 2 processes on a dual-core CPU; my CPU model is Intel Core i7-7000U. I compile and run the parallel version with mpic++ j.cpp -o j and mpirun -np 2 j. For example, with n = 10000 the parallel code takes about 7 seconds while my serial code takes 0.8 seconds, which is a huge difference. For the serial version I ran the same code, compiled with g++ -o serial serial.cpp and executed with ./serial (a sketch of that serial loop follows the parallel listing below). This is my parallel code:

#include <iostream>
#include "mpi.h"
#include <cmath>
#include <cstdlib>
using namespace std;

#define n 10000

void CreateMatrix(double *A){

    double value;

    for (int i=0;i<n*n;i++){
        //cout<<"Enter matrix value row i="<<i<<endl;
        //cin>>value;
        value=(rand()%1000)+1;
        A[i]=value;
    }

    for(int i=0;i<n;i++){
        A[i*n+i]=(1000*n+128.5)+rand()%8984;//dummy values on the diagonal...
    }                                       //...so that A is diagonally dominant
};
void CreateVector(double *b){

    double value;

    for(int i=0;i<n;i++){
        //cout<<"Enter vector b value row i="<<i<<endl;
        value=(rand()%974)-i*(rand()%20)+(rand()%1000);//random values
        b[i]=value;
    }
};
int JacobiMethod(double *A,double *b,double *x_out,int size,int rank,double tol,int max_iter){

    int n_local=n/size;
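    // n_local = rows of A (and entries of x) owned by this rank; assumes n is divisible by size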
    double *xold;
    double *xnew;
    double *temp1=new double [n];
    double *temp2=new double [n];
    double *swap;
    int iterations=0;
    int k;
    double sum,sum1;
    double error=1;
    int flag;


    MPI_Allgather(b,n_local,MPI_DOUBLE,temp1,n_local,MPI_DOUBLE,MPI_COMM_WORLD);//gather the local pieces of b from every process into temp1
    xold=temp2;
    xnew=temp1;
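    // xnew initially points at the gathered b (the initial guess);
    // the swap at the top of the while loop turns it into xold for the first iteration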

    while(abs(error)>tol && iterations<max_iter){
        swap=xnew;
        xnew=xold;
        xold=swap;
        iterations+=1;

        for(int i=0;i<n_local;i++){

            k=i+rank*n_local;//tells us the location of diagonal element


            sum=0;
            for(int j=0;j<n;j++){
                if (j!=k){
                    sum+=A[j+n*i]*xold[j];//A is stored contiguously, so n*i is the start of local row i


                }
            }

            x_out[i]=(b[i]-sum)/A[n*i+k];

        }
        MPI_Allgather(x_out,n_local,MPI_DOUBLE,xnew,n_local,MPI_DOUBLE,MPI_COMM_WORLD);
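        // convergence check: the 2-norm of the change between successive iterates,
        // computed redundantly on every rank from the full gathered vectors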
        sum1=0;
        for(int i=0;i<n;i++){
            sum1=sum1+pow((xnew[i]-xold[i]),2);
        }
        error=sqrt(sum1);
    }


    if (iterations>=max_iter){
        flag= -1;
    }
    else{
        flag= 0;

    }
    delete []A;
    delete []b;
    delete []temp1;
    delete  [] temp2;
    return flag;
};
void printResults( double *x_out,int size,int rank){
    int n_local=n/size;
    double *answ=new double [n];
    MPI_Gather(x_out, n_local, MPI_DOUBLE, answ, n_local, MPI_DOUBLE,0,MPI_COMM_WORLD);//gather all local results into answ on process 0
    if (rank==0){
        cout<<"The algorithm converges"<<endl;
        cout<<"The results are: "<<endl;
        for(int i=0;i<n;i++){
            //cout<<answ[i]<<endl;
        }
    }
    delete[] answ;
};

int main(){


double tol;//tolerance
int max_iter;

int converged;
//Mpi initialization
MPI_Init(NULL,NULL);
double *b_local;
double *A_local;
int size;
MPI_Comm_size(MPI_COMM_WORLD, &size);   
int rank;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);    
if (rank==0){
    b_local=new double[n];
    A_local=new double[n*n];
    cout<<"Enter talerance,number of iterations"<<endl;
    cin>>tol;
    cin>>max_iter;
    //Create A; it will be scattered to all processes below
    CreateMatrix(A_local);
    //Create b; it will be scattered to all processes below
    CreateVector(b_local);

}
//data init
double *A=new double[n*n/size];
double *b=new double[n/size];
double *x_out=new double[n/size];
//broadcast tol and max_iter to all processes
MPI_Bcast(&tol,1,MPI_DOUBLE,0,MPI_COMM_WORLD);//send to all processes
MPI_Bcast(&max_iter,1,MPI_INT,0,MPI_COMM_WORLD);    
//scatter vector b: each process gets n/size entries
MPI_Scatter(b_local,n/size,MPI_DOUBLE,b,n/size,MPI_DOUBLE,0,MPI_COMM_WORLD);//n/size each, since b is a single column
//scatter A to all processes
MPI_Scatter(A_local,(n/size)*n,MPI_DOUBLE,A,(n/size)*n,MPI_DOUBLE,0,MPI_COMM_WORLD);//(n/size)*n--->number of elements in n/size rows
if (rank==0){
 delete [] b_local;
 delete [] A_local;
}
double time0,time1;
time0=MPI_Wtime();
converged=JacobiMethod(A,b,x_out,size,rank,tol,max_iter);
time1=MPI_Wtime();

if (converged==0){
    cout<<"Time needed for the jacobi algorith to be executed is :"<<time1-time0<<endl;
    printResults(x_out,size,rank);
}
else{
    cout<<"Jacobi doesnt converge"<<endl;
}
MPI_Finalize(); 
return 0;   

}
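For reference, the serial baseline is essentially the same Jacobi iteration with the MPI calls removed; a minimal sketch of that loop (the actual serial.cpp may differ in details such as I/O and timing) is:

#include <cmath>
#include <utility>

// Minimal serial Jacobi sketch used as the baseline for the timing comparison.
// A is a dense N x N matrix stored row-major, b is the right-hand side,
// xold holds the current guess and xnew receives the updated iterate.
void JacobiSerial(const double *A,const double *b,double *xold,double *xnew,int N,double tol,int max_iter){
    double error=tol+1;
    int iterations=0;
    while(error>tol && iterations<max_iter){
        iterations+=1;
        for(int i=0;i<N;i++){
            double sum=0;
            for(int j=0;j<N;j++){
                if (j!=i){
                    sum+=A[i*N+j]*xold[j];
                }
            }
            xnew[i]=(b[i]-sum)/A[i*N+i];
        }
        error=0;
        for(int i=0;i<N;i++){
            error+=std::pow(xnew[i]-xold[i],2);
        }
        error=std::sqrt(error);
        std::swap(xold,xnew);//latest iterate becomes xold for the next pass
    }
}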

0 Answers:

No answers