Question

我正在将3-D Jacobi求解器从纯MPI转换为混合MPI + OpenMP。我有一个192x192x192数组，它在1-D分解中的纯MPI中的24个进程之间进行划分，即每个进程都有192/24 x 192 x 192 = 8 x 192 x 192个数据块。现在我做：

for(i=0 ; i <= 7; i++)
  for(j=0; j<= 191; j++)
    for(k=0; k<= 191; k++)
     {
      unew[i][j][k] = 1/6.0 * (u[i+1][j][k]+u[i-1][j][k]+
                               u[i][j+1][k]+u[i][j-1][k]+
                               u[i][j][k+1]+u[i][j][k-1]);
     }

此更新每个进程大约需要60秒。现在使用Hybrid MPI，我运行两个进程（每个进程1个进程--bind-to socket --map-by socket和OMP_PROC_PLACES=cores进程OMP_PROC_BIND=close）。我为每个MPI进程创建了12个线程（即每个套接字或处理器12个线程）。现在每个MPI进程都有一个大小为192/2 x 192 x 192 = 96x192x192元素的数组。每个线程都在每个进程拥有的数组的96/12 x 192 x 192 = 8 x 192 x 192 部分上工作。我使用线程进行相同的三次循环更新，但每个线程的时间大约为76秒。负载平衡在两个问题中都是完美的。 可能导致性能下降的原因是什么？是错误共享，因为线程可能使高速缓存行无效接近彼此的数据块？如果是，那么如何降低性能下降？（我故意没有提到重影数据，但最初我并没有重复与计算的通信。）

在回复以下评论时，我发布了代码。为长MWE道歉，但你可以非常安全地忽略（1）头文件声明（2）变量声明（3）内存分配例程（4）笛卡尔拓扑的形成（5）并行设置边界条件使用OpenMP并行区域（6）声明MPI_Type_subarray数据类型（7）MPI_Isend()和MPI_Irecv()调用和只关注（a）INDEPENDENT UPDATE OpenMP从这里调用并行区域（b）independent_update(...)例程。

/* IGNORE THIS PORTION */
#include<mpi.h>
#include<omp.h>


#include<stdio.h>
#include<stdlib.h>
#include<math.h>


#define MIN(a,b) (a < b ? a : b)
#define Tol 0.00001

 /* IGNORE THIS ROUTINE */
void input(int *X, int *Y, int *Z)
{
    int a=193, b=193, c=193;
    *X = a;
    *Y = b;
    *Z = c;
}
 /* IGNORE THIS ROUTINE */
float*** allocate_mem(int X, int Y, int Z)
{
    int i,j;
    float ***matrix;
    float *arr;

    arr = (float*)calloc(X*Y*Z, sizeof(float));

    matrix = (float***)calloc(X, sizeof(float**));

    for(i = 0 ; i<= X-1; i++)
        matrix[i] = (float**)calloc(Y, sizeof(float*));

    for(i = 0 ; i <= X-1; i++)
        for(j=0; j<= Y-1; j++)
            matrix[i][j] = &(arr[i*Y*Z + j*Z]); 

    return matrix ; 
}

/* THIS ROUTINE IS IMPORTANT */

float independent_update(float ***old, float ***new, int NX, int NY, int NZ, int tID, int chunk)
{
    int i,j,k, start, end;
    float error = 0.0; 
    float diff;


        start = tID * chunk + 1;
        end = MIN( (tID+1)*chunk, NX-2 ); 

        for(i = start; i <= end ; i++)
        {
            for(j = 1; j<= NY-2; j++)
            {
                #pragma omp simd
                for(k = 1; k<= NZ-2; k++)
                {
                    new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
                    diff = 1.0 - new[i][j][k]; 
                    diff = (diff > 0 ? diff : -1.0 * diff ); 
                    if(diff > error)
                        error = diff;
                }
            }
        }   
    return error;  

}


int main(int argc, char *argv[])
{

    /* IGNORE VARIABLE DECLARATION */
    int size, rank;                         //Size of old_comm and rank of process
    int i, j, k,l;                          //General loop variables
    MPI_Comm old_comm, new_comm;            //MPI_COMM_WORLD handle and for MPI_Cart_create()
    int N[3];                               //For taking input of size of matrix from user
    int P;                                  //Represent number of processes i.e. same as size
    int dims[3];                            //For dimensions of Cartesian topology
    int PX, PY, PZ;                         //X dim, Y dim, Z dim of each process
    float ***old, ***new, ***temp;          //Matrices for results dimensions is (Px+2)*(PY+2)*(PZ+2)
    int period[3];                          //Periodicity for each dimension
    int reorder;                            //Whether processes should be reordered in new cartesian topology
    int ndims;                              //Number of dimensions (which is 3)
    int Z_TOWARDS_U, Z_AWAY_U;              //Z neighbour towards you and away from you (Z const)
    int X_DOWN, X_UP;                       //Below plane and above plane (X const)
    int Y_LEFT, Y_RIGHT;                    //Left plane and right plane (Y const)
    int coords[3];                          //Finding coordinates of processes
    int dimension;                          //Used in MPI_Cart_shift() , values = 0, 1,2 
    int displacement;                       //Used in MPI_Cart_shift(), values will be +1 to find immediate neighbours
    float l_max_err;                        //Local maximum error on process
    float l_max_err_new;                    //For dependent faces. 
    float G_max_err = 1.0;                  //Maximum error for stopping criterion
    int iterations = 0 ;                    //Counting number of iterations 
    MPI_Request send[6], recv[6];           //For MPI_Isend and MPI_Irecv               
    int start[3];                           //Start will be defined in MPI_Isend() and MPI_Irecv()
    int gsize[3];                           //Defining global size of subarray  
    MPI_Datatype x_subarray;                //For sending X_UP and X_DOWN
    int local_x[3];                         //Defining local plane size for X_UP/X_DOWN
    MPI_Datatype y_subarray;                //For sending Y_LEFT and Y_RIGHT
    int local_y[3];                         //Defining local plane for Y_LEFT/Y_RIGHT
    MPI_Datatype z_subarray;                //For sending Z_TOWARDS_U and Z_AWAY_U
    int local_z[3];                         //Defining local plan size for XY plane i.e. where Z=0

    double strt, end;                                   //For measuring time
    double strt1, end1, delta1;                         //For measuring trivial time 1
    double strt2, end2, delta2;                         //For measuring trivial time 2  
    double t_i_strt, t_i_end, t_i_sum=0;                            //Time for independent computational kernel
    double t_up_strt, t_up_end, t_up_sum=0;                         //Time for X_UP
    double t_down_strt, t_down_end, t_down_sum=0;                   //Time for X_DOWN
    double t_left_strt, t_left_end, t_left_sum=0;                   //Time for Y_LEFT
    double t_right_strt, t_right_end, t_right_sum=0;                //Time for Y_RIGHT
    double t_towards_strt, t_towards_end, t_towards_sum=0;          //For Z_TOWARDS_U   
    double t_away_strt, t_away_end, t_away_sum=0;                   //For Z_AWAY_U
    double t_comm_strt, t_comm_end, t_comm_sum=0;                   //Time comm + independent update (need to subtract to get comm time)
    double t_setup_strt,t_setup_end;                                //Set-up start and end time
    double t_allred_strt,t_allred_end,t_allred_total=0.0;           //Measuring Allreduce time separately.

    int threadID;                           //ID of a thread
    int nthreads;                           //Total threads in OpenMP region
    int chunk;                              //chunk - used to calculate iterations of a thread
    /* IGNORE MPI STARTUP ETC */
    MPI_Init(&argc, &argv);

    t_setup_strt = MPI_Wtime();

    old_comm = MPI_COMM_WORLD;
    MPI_Comm_size(old_comm, &size);
    MPI_Comm_rank(old_comm, &rank);
    P = size; 


    if(rank == 0)
    {
        input(&N[0], &N[1], &N[2]);
    }

    MPI_Bcast(N, 3, MPI_INT, 0, old_comm); 

    dims[0] = 0;
    dims[1] = 0;
    dims[2] = 0; 

    period[0] = period[1] = period[2] = 0;          //All dimensions aperiodic
    reorder = 0 ;                                   //No reordering of ranks in new_comm
    ndims = 3; 
    MPI_Dims_create(P,ndims,dims);
    MPI_Cart_create(old_comm, ndims, dims, period, reorder, &new_comm);



    if( (N[0]-1) % dims[0] == 0 && (N[1]-1) % dims[1] == 0 && (N[2]-1) % dims[2] == 0 )
    {

        PX = (N[0]-1)/dims[0];                      //Rows of unknowns each process gets
        PY = (N[1]-1)/dims[1];                      //Columns of unknowns each process gets
        PZ = (N[2]-1)/dims[2];                      //Depth of unknowns each process gets
    }


    old = allocate_mem(PX+2, PY+2, PZ+2);       //3D arrays with ghost points
    new = allocate_mem(PX+2, PY+2, PZ+2);       //3D arrays with ghost points


    dimension = 0; 
    displacement = 1; 

    MPI_Cart_shift(new_comm, dimension, displacement, &X_UP, &X_DOWN); //Find UP and DOWN neighbours

    dimension = 1; 
    MPI_Cart_shift(new_comm, dimension, displacement, &Y_LEFT, &Y_RIGHT); //Find UP and DOWN neighbours

    dimension = 2;
    MPI_Cart_shift(new_comm, dimension, displacement, &Z_TOWARDS_U, &Z_AWAY_U); //Find UP and DOWN neighbours



/* IGNORE BOUNDARY SETUPS FOR PDE */
#pragma omp parallel for default(none) shared(old,new,PX,PY,PZ) private(i,j,k) schedule(static)
    for(i = 0; i <= PX+1; i++)
    {
        for(j = 0; j <= PY+1; j++)
        {
            for(k = 0; k <= PZ+1; k++)
            {
                old[i][j][k] = 0.0;
                new[i][j][k] = 0.0; 
            }
        }
    }



#pragma omp parallel default(none) shared(X_DOWN,X_UP,Y_LEFT,Y_RIGHT,Z_TOWARDS_U,Z_AWAY_U,old,new,PX,PY,PZ) private(i,j,k,threadID,nthreads)
{   
    threadID = omp_get_thread_num();
    nthreads = omp_get_num_threads();

if(threadID == 0)
 {  
    if(X_DOWN == MPI_PROC_NULL)                         //X is constant here, this is YZ upper plane
    {
        for(j = 1 ; j<= PY ; j++)
            for(k = 1 ; k<= PZ ; k++)
            {
                old[0][j][k] = 1;
                new[0][j][k] = 1;                       //Set boundaries in new also
            }   
    }
}

if(threadID == (nthreads-1))
{
    if(X_UP == MPI_PROC_NULL)                           //YZ lower plane
    {
        for(j = 1 ; j<= PY ; j++)
            for(k = 1; k<= PZ ; k++)
            {
                old[PX+1][j][k] = 1;
                new[PX+1][j][k] = 1;
            }
    }
}


    if(Y_LEFT == MPI_PROC_NULL)                         //Y is constant, this is left XZ plane, possibly can use collapse(2) 
    {
    #pragma omp for schedule(static)
        for(i = 1 ; i<= PX ; i++)
            for(k = 1; k<= PZ; k++)
            {
                old[i][0][k] = 1;
                new[i][0][k] = 1; 
            }
    }

    if(Y_RIGHT == MPI_PROC_NULL)                        //XZ right plane, again collapse(2) potential 
    {
    #pragma omp for schedule(static)
        for(i = 1 ; i<= PX; i++)
            for(k = 1; k<= PZ ; k++)
            {
                old[i][PY+1][k] = 1;
                new[i][PY+1][k] = 1; 
            }   
    }

    if(Z_TOWARDS_U == MPI_PROC_NULL)                    //Z is constant here, towards you XY plane, collapse(2)
    {
     #pragma omp for schedule(static)
        for(i = 1 ; i<= PX ; i++)
            for(j = 1; j<= PY  ; j++)
            {
                old[i][j][0] = 1;
                new[i][j][0] = 1; 
            }
    }

    if(Z_AWAY_U == MPI_PROC_NULL)                       //Away from you XY plane, collapse(2) 
    {
    #pragma omp for schedule(static)
        for(i = 1 ; i<= PX; i++)
            for(j = 1; j<= PY ; j++)
            {
                old[i][j][PZ+1] = 1;
                new[i][j][PZ+1] = 1; 
            }   
    }

}

    /* IGNORE SUBARRAY DECLARATION */
    gsize[0] = PX+2;                    //Global sizes of 3-D cubes for each process
    gsize[1] = PY+2;
    gsize[2] = PZ+2;

    start[0] = 0;                       //Will specify starting location while sending/receiving 
    start[1] = 0;
    start[2] = 0;


    local_x[0] = 1;
    local_x[1] = PY;
    local_x[2] = PZ;

    MPI_Type_create_subarray(ndims, gsize, local_x, start, MPI_ORDER_C, MPI_FLOAT, &x_subarray);
    MPI_Type_commit(&x_subarray); 


    local_y[0] = PX;
    local_y[1] = 1;
    local_y[2] = PZ;

    MPI_Type_create_subarray(ndims, gsize, local_y, start, MPI_ORDER_C, MPI_FLOAT, &y_subarray);
    MPI_Type_commit(&y_subarray); 


    local_z[0] = PX;
    local_z[1] = PY;
    local_z[2] = 1;

    MPI_Type_create_subarray(ndims, gsize, local_z, start, MPI_ORDER_C, MPI_FLOAT, &z_subarray);
    MPI_Type_commit(&z_subarray); 

    t_setup_end = MPI_Wtime(); 

    strt = MPI_Wtime();

while(G_max_err > Tol) //iterations < ITERATIONS)
{

    iterations++ ; 


    t_comm_strt = MPI_Wtime();

        /* IGNORE MPI COMMUNICATION */
        MPI_Irecv(&old[0][1][1], 1, x_subarray, X_DOWN, 10, new_comm, &recv[0]);    
        MPI_Irecv(&old[PX+1][1][1], 1, x_subarray, X_UP, 20, new_comm, &recv[1]);
        MPI_Irecv(&old[1][PY+1][1], 1, y_subarray, Y_RIGHT, 30, new_comm, &recv[2]);
        MPI_Irecv(&old[1][0][1], 1, y_subarray, Y_LEFT, 40, new_comm, &recv[3]); 
        MPI_Irecv(&old[1][1][PZ+1], 1, z_subarray, Z_AWAY_U, 50, new_comm, &recv[4]);
        MPI_Irecv(&old[1][1][0], 1, z_subarray, Z_TOWARDS_U, 60, new_comm, &recv[5]);
        MPI_Isend(&old[PX][1][1], 1, x_subarray, X_UP, 10, new_comm, &send[0]);
        MPI_Isend(&old[1][1][1], 1, x_subarray, X_DOWN, 20, new_comm, &send[1]);
        MPI_Isend(&old[1][1][1], 1, y_subarray, Y_LEFT, 30, new_comm, &send[2]);
        MPI_Isend(&old[1][PY][1], 1, y_subarray, Y_RIGHT, 40, new_comm, &send[3]);
        MPI_Isend(&old[1][1][1], 1, z_subarray, Z_TOWARDS_U, 50, new_comm, &send[4]);
        MPI_Isend(&old[1][1][PZ], 1, z_subarray, Z_AWAY_U, 60, new_comm, &send[5]);
        MPI_Waitall(6, send, MPI_STATUSES_IGNORE);
        MPI_Waitall(6, recv, MPI_STATUSES_IGNORE);

    t_comm_end = MPI_Wtime();
    t_comm_sum = t_comm_sum + (t_comm_end - t_comm_strt); 



/* Use threads in Independent update */

    t_i_strt = MPI_Wtime();
    l_max_err = 0.0;                        //Very important, Reduction result is combined with this !

/* THIS IS THE IMPORTANT REGION */
    #pragma omp parallel default(none) shared(old,new,PX,PY,PZ,chunk) private(threadID,nthreads) reduction(max:l_max_err)
    {   
        nthreads = omp_get_num_threads();
        threadID = omp_get_thread_num();

        chunk = (PX-1+1) / nthreads ; 

        l_max_err = independent_update(old, new, PX+2, PY+2, PZ+2, threadID, chunk); 

    }

    t_i_end = MPI_Wtime();
    t_i_sum = t_i_sum + (t_i_end - t_i_strt) ; 

    /* IGNORE THE REMAINING CODE */
    t_allred_strt = MPI_Wtime();
    MPI_Allreduce(&l_max_err, &G_max_err, 1, MPI_FLOAT, MPI_MAX, new_comm);
    t_allred_end = MPI_Wtime();
    t_allred_total = t_allred_total + (t_allred_end - t_allred_strt);


    temp = new ;
    new = old;
    old = temp;         
    }


    MPI_Barrier(new_comm);
    end = MPI_Wtime();

    if( rank == 0)
    {
        printf("\nIterations = %d, G_max_err = %f", iterations, G_max_err);
        printf("\nThe total SET-UP time for MPI and boundary conditions is %lf", (t_setup_end-t_setup_strt));
        printf("\nThe total time for SOLVING is %lf", (end-strt));
        printf("\nThe total time for INDEPENDENT COMPUTE %lf", t_i_sum);
        printf("\nThe total time for COMMUNICATION OVERHEAD is %lf", t_comm_sum);
        printf("\nThe total time for MPI_ALLREDUCE() is %lf", t_allred_total);
    }


    MPI_Type_free(&x_subarray);
    MPI_Type_free(&y_subarray);
    MPI_Type_free(&z_subarray);

    free(&old[0][0][0]);
    free(&new[0][0][0]); 

    MPI_Finalize();
    return 0;  

}

P.S。：我几乎可以肯定，产生/唤醒线程的成本并不是时间上如此巨大差异的原因。请查找随附的Scalasca快照，了解混合程序的独立计算。

使用循环simd构造

#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,l_max_err) private(i,j,k,diff)
    { 
        #pragma omp for simd  schedule(static) reduction(max:l_max_err) 
        for(i = 1; i <= PX ; i++)
            {
                for(j = 1; j<= PY; j++)
                {
                    for(k = 1; k<= PZ; k++)
                    {
                        new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
                        diff = 1.0 - new[i][j][k]; 
                        diff = (diff > 0 ? diff : -1.0 * diff ); 
                        if(diff > l_max_err)
                            l_max_err = diff;
                    }
                }
            }
    }

混合MPI + OpenMP与MPI性能

0 个答案: