Question

Donzis & Aditya 的论文提出，可以使用模板（stencil）中允许存在延迟的有限差分格式。这是什么意思？有限差分（FD）格式可用于求解热方程，其形式为（或其简化形式）：

u[t+1,i] = u[t,i] + c (u[t,i-1]-u[t,i+1])

也就是说，下一个时间步的值取决于前一时间步中同一位置及其相邻位置的值。

通过将（在我们的情况下为1D）域分割到不同的处理器上，可以很容易地将这个问题并行化。但是，在处理器上计算边界节点时需要进行通信，因为元素u[t,i±1]仅在另一个处理器上可用。

下图中说明了问题，该图取自引用的论文。

enter image description here

MPI实现可以使用MPI_Send和MPI_Recv进行同步计算。由于计算本身相当简单，通信很可能成为瓶颈。

该问题的解决方案在论文中给出：

不对进程进行同步，而是直接使用当前可用的边界节点的值，即使它可能是较早时间步的值。在某些假设下，该方法仍然收敛。

对于我的工作，我想实现异步MPI案例（这不是本文的一部分）。使用MPI_Send和MPI_Recv的同步部分正常运行。我将内存扩展了两个元素作为相邻元素的ghost单元格，并通过发送和接收发送所需的值。下面的代码基本上是上图的实现，并在计算之前的每个时间步骤中执行。

/* Blocking ghost-cell exchange, run before the stencil update of every time step.
 * u[0] and u[NpP+1] are the two extra ghost cells; u[1]..u[NpP] are presumably
 * the points owned by this rank (NpP is not defined in this fragment).
 * Each message is tagged with the sender's rank, so the receiver matches on
 * tag == source rank.
 * NOTE(review): every rank sends before it receives; this relies on MPI_Send
 * returning for a one-element message before the matching receive is posted --
 * confirm, or use MPI_Sendrecv. */

/* Pass the last owned value to RIGHT and fill the left ghost cell from LEFT. */
MPI_Send(&u[NpP],1,MPI_DOUBLE,RIGHT,rank,MPI_COMM_WORLD);
MPI_Recv(&u[0],1,MPI_DOUBLE,LEFT,LEFT,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

/* Pass the first owned value to LEFT and fill the right ghost cell from RIGHT. */
MPI_Send(&u[1],1,MPI_DOUBLE,LEFT,rank,MPI_COMM_WORLD);
MPI_Recv(&u[NpP+1],1,MPI_DOUBLE,RIGHT,RIGHT,MPI_COMM_WORLD,MPI_STATUS_IGNORE);

不过，我绝不是MPI专家。我发现MPI_Put可能正是异步情况所需要的；稍微阅读了一些资料之后，我写出了以下实现。

在时间循环之前：

/* One-time setup, done before the time loop: expose a two-element buffer that
 * neighbouring ranks can write into with MPI_Put. */
MPI_Win win;
/* Landing area for the neighbours' values: boundary[0] and boundary[1]. */
double *boundary;
MPI_Alloc_mem(sizeof(double) * 2, MPI_INFO_NULL, &boundary);
MPI_Info info;
MPI_Info_create(&info);
/* "no_locks" = "true" is the MPI info hint promising that this window is never
 * used with MPI_Win_lock-style (passive-target) synchronization. */
MPI_Info_set(info,"no_locks","true");
/* Displacement unit is sizeof(double), so target displacements 0 and 1 address
 * boundary[0] and boundary[1].
 * NOTE(review): `info` is not released with MPI_Info_free in this fragment --
 * confirm it is freed elsewhere. */
MPI_Win_create(boundary, 2*sizeof(double), sizeof(double), info, MPI_COMM_WORLD, &win);

在时间循环中：

/* Per-time-step ghost-cell exchange through the RMA window. */
/* Write my first value into boundary[1] on LEFT (target displacement 1). */
MPI_Put(&u[1],1,MPI_DOUBLE,LEFT,1,1,MPI_DOUBLE,win);
/* Write my last value into boundary[0] on RIGHT (target displacement 0). */
MPI_Put(&u[NpP],1,MPI_DOUBLE,RIGHT,0,1,MPI_DOUBLE,win);
/* Collective fence over the window: completes the puts above before the local
 * reads below, and is what keeps this variant synchronous.
 * NOTE(review): on the first iteration the puts are issued before any
 * MPI_Win_fence has been called in this fragment -- confirm a fence opens the
 * epoch before the loop. */
MPI_Win_fence(0,win);
/* Copy the values the neighbours deposited into the local ghost cells. */
u[0] = boundary[0];
u[NpP+1] = boundary[1];

这段代码把所需的元素放入相邻处理器上的窗口（即boundary，一个包含两个元素的数组），然后从本地的boundary数组中取出u[0]和u[NpP+1]的值。这个实现可以正常运行，得到的结果与MPI_Send/Recv相同。但是，这并非真正的异步，因为我仍在使用MPI_Win_fence，而据我所知，它会保证同步。

问题是：如果我去掉MPI_Win_fence，boundary内的值就永远不会更新，一直保持初始值。而我的理解是，如果没有MPI_Win_fence，读取到的应该是boundary内当前可用的任何值，它可能已经被相邻处理器更新，也可能还没有。

有没有人知道如何在不使用MPI_Win_fence的情况下，解决boundary内的值永远不会更新的问题？

我也不确定，如果我提供的代码足以理解我的问题或提供任何提示。如果是这种情况，请随意询问，因为我将尝试添加所有缺失的部分。

Answer 1

下面的做法对我来说似乎可行（至少能正确执行）。这是取自我们某个教程的一个小型1D热方程程序，改用了RMA：

/* Passive-target ghost-cell exchange: each put is wrapped in its own exclusive
 * lock/unlock epoch on the target rank, so no collective call is involved.
 * NOTE(review): `left`/`right` may be MPI_PROC_NULL on the edge ranks; whether
 * MPI_Win_lock accepts MPI_PROC_NULL as a target is implementation-dependent --
 * confirm or guard these epochs. */

/* Deposit my first interior value into the left neighbour's right ghost cell. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, left, 0, rightwin );
MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
MPI_Win_unlock( left, rightwin );

/* Deposit my last interior value into the right neighbour's left ghost cell. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, right, 0, leftwin );
MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
MPI_Win_unlock( right, leftwin );

/* Read my own left ghost cell while holding an exclusive lock on my own window. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
temperature[current][0]           = *leftgc;
MPI_Win_unlock( rank, leftwin );

/* Same for my own right ghost cell. */
MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
temperature[current][locpoints+1] = *rightgc;
MPI_Win_unlock( rank, rightwin );

在代码中，我甚至让偶数编号的进程在每个时间步额外等待10ms，以确保各进程不同步；但从跟踪结果来看，各进程实际上仍然相当同步。我不知道这种高度同步能否通过调整代码来消除，还是实现（IntelMPI 5.0.1）本身的限制，或者仅仅是因为计算耗时太短、通信时间占主导所致（不过就最后一点而言，增大usleep间隔似乎没有影响）。

#define _BSD_SOURCE     /* usleep */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>


/*
 * Explicit finite-difference solver for the 1-D heat equation on `totpoints`
 * grid points, block-distributed over the ranks of MPI_COMM_WORLD.
 *
 * Each rank stores its `locpoints` interior values in temperature[.][1..locpoints]
 * plus one ghost cell on each side (indices 0 and locpoints+1).  Ghost cells are
 * refreshed once per step:
 *   - with ONESIDED defined: by MPI_Put into the neighbour's one-float window,
 *     each put inside its own exclusive lock/unlock epoch;
 *   - otherwise: by two MPI_Sendrecv calls.
 * After `nsteps` steps rank 0 prints the RMS of the global temperature field.
 *
 * Returns 0 on success; aborts the MPI job if a buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    /* simulation parameters */
    const int totpoints=1000;
    int locpoints;
    const float xleft = -12., xright = +12.;
    float locxleft, locxright;
    const float kappa = 1.;

    const int nsteps=100;

    /* data structures */
    float *x;
    float **temperature;

    /* parameters of the original temperature distribution */
    const float ao=1., sigmao=1.;

    float fixedlefttemp, fixedrighttemp;

    int current, new;
    int step, i;
    float time;
    float dt, dx;
    float rms;

    int rank, size;
    int start,end;
    int left, right;
    int lefttag=1, righttag=2;   /* only used by the two-sided (non-ONESIDED) path */

    /* MPI Initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    /* Block decomposition; the last rank also takes the remainder. */
    locpoints = totpoints/size;
    start = rank*locpoints;
    end   = (rank+1)*locpoints - 1;
    if (rank == size-1)
        end = totpoints-1;
    locpoints = end-start+1;

    /* Edge ranks have no neighbour on one side. */
    left = rank-1;
    if (left < 0) left = MPI_PROC_NULL;
    right= rank+1;
    if (right >= size) right = MPI_PROC_NULL;

    #ifdef ONESIDED
    if (rank == 0)
        printf("Onesided: Allocating windows\n");
    /* One single-float window per side: *leftgc / *rightgc receive the value
       that the left / right neighbour puts for this rank. */
    MPI_Win leftwin, rightwin;
    float *leftgc, *rightgc;
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &leftgc,  &leftwin);
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &rightgc, &rightwin);
    #endif
    /* set parameters */

    dx = (xright-xleft)/(totpoints-1);
    dt = dx*dx * kappa/10.;

    locxleft = xleft + start*dx;
    locxright = xleft + end*dx;

    x           = malloc((locpoints+2) * sizeof *x);
    temperature = malloc(2 * sizeof *temperature);
    if (x == NULL || temperature == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    temperature[0] = malloc((locpoints+2) * sizeof *temperature[0]);
    temperature[1] = malloc((locpoints+2) * sizeof *temperature[1]);
    if (temperature[0] == NULL || temperature[1] == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    current = 0;
    new = 1;

    /* setup initial conditions: a Gaussian, evaluated on the ghost cells too */

    time = 0.;
    for (i=0; i<locpoints+2; i++) {
        x[i] = locxleft + (i-1)*dx;
        temperature[current][i] = ao*exp(-(x[i]*x[i]) / (2.*sigmao*sigmao));
    }
    fixedlefttemp = ao*exp(-(locxleft-dx)*(locxleft-dx) / (2.*sigmao*sigmao));
    fixedrighttemp= ao*exp(-(locxright+dx)*(locxright+dx)/(2.*sigmao*sigmao));
    #ifdef ONESIDED
    /* Seed the window memory inside a lock epoch on our own rank, then wait
       for everybody: otherwise a fast neighbour's first MPI_Put could land
       before the seed and be overwritten by it. */
    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
    *leftgc  = fixedlefttemp;
    MPI_Win_unlock( rank, leftwin );

    MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
    *rightgc = fixedrighttemp;
    MPI_Win_unlock( rank, rightwin );

    MPI_Barrier(MPI_COMM_WORLD);
    #endif

    /* evolve */
    for (step=0; step < nsteps; step++) {
        /* boundary conditions: keep endpoint temperatures fixed. */

        #ifdef ONESIDED
            /* The MPI standard only guarantees MPI_PROC_NULL as a target for
               RMA communication calls, not for MPI_Win_lock/unlock, so edge
               ranks skip the whole epoch.  Nobody ever puts into the outer
               ghost cell of an edge rank, so it keeps its seeded fixed value. */
            if (left != MPI_PROC_NULL) {
                /* my first interior point -> left neighbour's right ghost cell */
                MPI_Win_lock( MPI_LOCK_EXCLUSIVE, left, 0, rightwin );
                MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
                MPI_Win_unlock( left, rightwin );
            }

            if (right != MPI_PROC_NULL) {
                /* my last interior point -> right neighbour's left ghost cell */
                MPI_Win_lock( MPI_LOCK_EXCLUSIVE, right, 0, leftwin );
                MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
                MPI_Win_unlock( right, leftwin );
            }

            /* Read whatever the neighbours have deposited so far; the value may
               stem from an earlier step of the neighbour. */
            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, leftwin );
            temperature[current][0]           = *leftgc;
            MPI_Win_unlock( rank, leftwin );

            MPI_Win_lock( MPI_LOCK_EXCLUSIVE, rank, 0, rightwin );
            temperature[current][locpoints+1] = *rightgc;
            MPI_Win_unlock( rank, rightwin );
        #else
            temperature[current][0] = fixedlefttemp;
            temperature[current][locpoints+1] = fixedrighttemp;

            /* send data rightwards */
            MPI_Sendrecv(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, righttag,
                         &(temperature[current][0]), 1, MPI_FLOAT, left,  righttag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

            /* send data leftwards */
            MPI_Sendrecv(&(temperature[current][1]), 1, MPI_FLOAT, left, lefttag,
                         &(temperature[current][locpoints+1]), 1, MPI_FLOAT, right,  lefttag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        #endif

        /* explicit three-point stencil update of the interior points */
        for (i=1; i<locpoints+1; i++) {
            temperature[new][i] = temperature[current][i] + dt*kappa/(dx*dx) *
                (temperature[current][i+1] - 2.*temperature[current][i] +
                 temperature[current][i-1]) ;
        }

        time += dt;

        /* Deliberately delay even ranks by 10 ms to push the ranks out of step. */
        if ((rank % 2) == 0)
            usleep(10000u);

        /* swap the two time levels */
        current = new;
        new = 1 - current;
    }

    /* local sum of squares over interior points, reduced onto rank 0 */
    rms  = 0.;
    for (i=1;i<locpoints+1;i++) {
        rms += (temperature[current][i])*(temperature[current][i]);
    }
    float totrms;
    MPI_Reduce(&rms, &totrms, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        totrms = sqrt(totrms/totpoints);
        printf("Step = %d, Time = %g, RMS value = %g\n", step, time, totrms);
    }


    #ifdef ONESIDED
    MPI_Win_free(&leftwin);
    MPI_Win_free(&rightwin);
    #endif

    free(temperature[1]);
    free(temperature[0]);
    free(temperature);
    free(x);

    MPI_Finalize();
    return 0;
}

Answer 2

这是Jonathan Dursi帖子中代码的副本，但改用了MPI-3的RMA同步方式……

#define _BSD_SOURCE     /* usleep */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>


/*
 * Explicit finite-difference solver for the 1-D heat equation on `totpoints`
 * grid points, block-distributed over the ranks of MPI_COMM_WORLD, using
 * MPI-3 passive-target RMA for the ghost-cell exchange.
 *
 * Each rank stores its `locpoints` interior values in temperature[.][1..locpoints]
 * plus one ghost cell on each side (indices 0 and locpoints+1).  Both windows are
 * locked once with MPI_Win_lock_all for the whole run; every step each rank puts
 * its edge values into the neighbours' windows, flushes, and then reads its own
 * window memory.  After `nsteps` steps rank 0 prints the RMS of the global
 * temperature field.  ONESIDED must be defined at compile time.
 *
 * Returns 0 on success; aborts the MPI job if a buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    /* simulation parameters */
    const int totpoints=1000;
    int locpoints;
    const float xleft = -12., xright = +12.;
    float locxleft, locxright;
    const float kappa = 1.;

    const int nsteps=100;

    /* data structures */
    float *x;
    float **temperature;

    /* parameters of the original temperature distribution */
    const float ao=1., sigmao=1.;

    float fixedlefttemp, fixedrighttemp;

    int current, new;
    int step, i;
    float time;
    float dt, dx;
    float rms;

    int rank, size;
    int start,end;
    int left, right;

    /* MPI Initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    /* Block decomposition; the last rank also takes the remainder. */
    locpoints = totpoints/size;
    start = rank*locpoints;
    end   = (rank+1)*locpoints - 1;
    if (rank == size-1)
        end = totpoints-1;
    locpoints = end-start+1;

    /* Edge ranks have no neighbour on one side. */
    left = rank-1;
    if (left < 0) left = MPI_PROC_NULL;
    right= rank+1;
    if (right >= size) right = MPI_PROC_NULL;

    #ifdef ONESIDED
    if (rank == 0)
        printf("Onesided: Allocating windows\n");
    /* One single-float window per side: *leftgc / *rightgc receive the value
       that the left / right neighbour puts for this rank. */
    MPI_Win leftwin, rightwin;
    float *leftgc, *rightgc;
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &leftgc,  &leftwin);
    MPI_Win_allocate(sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &rightgc, &rightwin);
    /* Open one passive-target epoch to all ranks for the whole run. */
    MPI_Win_lock_all(MPI_MODE_NOCHECK, leftwin);
    MPI_Win_lock_all(MPI_MODE_NOCHECK, rightwin);
    #endif
    /* set parameters */

    dx = (xright-xleft)/(totpoints-1);
    dt = dx*dx * kappa/10.;

    locxleft = xleft + start*dx;
    locxright = xleft + end*dx;

    x           = malloc((locpoints+2) * sizeof *x);
    temperature = malloc(2 * sizeof *temperature);
    if (x == NULL || temperature == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    temperature[0] = malloc((locpoints+2) * sizeof *temperature[0]);
    temperature[1] = malloc((locpoints+2) * sizeof *temperature[1]);
    if (temperature[0] == NULL || temperature[1] == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    current = 0;
    new = 1;

    /* setup initial conditions: a Gaussian, evaluated on the ghost cells too */

    time = 0.;
    for (i=0; i<locpoints+2; i++) {
        x[i] = locxleft + (i-1)*dx;
        temperature[current][i] = ao*exp(-(x[i]*x[i]) / (2.*sigmao*sigmao));
    }
    fixedlefttemp = ao*exp(-(locxleft-dx)*(locxleft-dx) / (2.*sigmao*sigmao));
    fixedrighttemp= ao*exp(-(locxright+dx)*(locxright+dx)/(2.*sigmao*sigmao));
    #ifdef ONESIDED
    /* Seed the window memory, publish the local stores with MPI_Win_sync and
       wait for everybody: otherwise a fast neighbour's first MPI_Put could land
       before the seed and be overwritten by it. */
    *leftgc  = fixedlefttemp;
    *rightgc = fixedrighttemp;
    MPI_Win_sync(leftwin);
    MPI_Win_sync(rightwin);
    MPI_Barrier(MPI_COMM_WORLD);
    #endif

    /* evolve */
    for (step=0; step < nsteps; step++) {
        /* boundary conditions: keep endpoint temperatures fixed. */

        /* RMA code assumes no conflicts in updates via MPI_Put.
           If that is wrong, hopefully it is fine to use MPI_Accumulate
           with MPI_SUM to accumulate the result. */
        #ifdef ONESIDED
            /* The MPI standard only guarantees MPI_PROC_NULL as a target for
               RMA communication calls, not for MPI_Win_flush, so edge ranks
               skip the put/flush pair.  Nobody ever puts into the outer ghost
               cell of an edge rank, so it keeps its seeded fixed value. */
            if (left != MPI_PROC_NULL) {
                /* my first interior point -> left neighbour's right ghost cell */
                MPI_Put(&(temperature[current][1]),         1, MPI_FLOAT, left,  0, 1, MPI_FLOAT, rightwin);
                MPI_Win_flush( left, rightwin );
            }

            if (right != MPI_PROC_NULL) {
                /* my last interior point -> right neighbour's left ghost cell */
                MPI_Put(&(temperature[current][locpoints]), 1, MPI_FLOAT, right, 0, 1, MPI_FLOAT, leftwin);
                MPI_Win_flush( right, leftwin );
            }

            /* MPI_Win_flush only completes operations issued by this rank; it
               does nothing for values that neighbours put into our window.
               MPI_Win_sync synchronizes the private and public copies of the
               window before the local load.  The value read may still stem
               from an earlier step of the neighbour, which is the intended
               asynchronous behaviour. */
            MPI_Win_sync( leftwin );
            temperature[current][0]           = *leftgc;

            MPI_Win_sync( rightwin );
            temperature[current][locpoints+1] = *rightgc;
        #else
        #error Define ONESIDED...
        #endif

        /* explicit three-point stencil update of the interior points */
        for (i=1; i<locpoints+1; i++) {
            temperature[new][i] = temperature[current][i] + dt*kappa/(dx*dx) *
                (temperature[current][i+1] - 2.*temperature[current][i] +
                 temperature[current][i-1]) ;
        }

        time += dt;

        /* Deliberately delay even ranks by 10 ms to push the ranks out of step. */
        if ((rank % 2) == 0)
            usleep(10000u);

        /* swap the two time levels */
        current = new;
        new = 1 - current;
    }

    /* local sum of squares over interior points, reduced onto rank 0 */
    rms  = 0.;
    for (i=1;i<locpoints+1;i++) {
        rms += (temperature[current][i])*(temperature[current][i]);
    }
    float totrms;
    MPI_Reduce(&rms, &totrms, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        totrms = sqrt(totrms/totpoints);
        printf("Step = %d, Time = %g, RMS value = %g\n", step, time, totrms);
    }


    #ifdef ONESIDED
    /* Close the run-long passive-target epochs before freeing the windows. */
    MPI_Win_unlock_all(leftwin);
    MPI_Win_unlock_all(rightwin);
    MPI_Win_free(&leftwin);
    MPI_Win_free(&rightwin);
    #endif

    free(temperature[1]);
    free(temperature[0]);
    free(temperature);
    free(x);

    MPI_Finalize();
    return 0;
}

使用MPI_Put的异步有限差分格式

2 个答案: