I'm new to MPI and am trying to solve a PDE approximately on a 1000 x 1000 array. At each iteration, every element except those in the first and last rows is updated to the average of its 8 neighbours.
My code runs, but the results differ slightly in the third decimal place depending on how many processors I use. I'm guessing my communication is losing precision? I split the big array by rows, since C/C++ stores arrays row by row.
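For concreteness, the interior update being described is an 8-neighbour (Moore neighbourhood) average with the column index wrapping periodically, which is what the modulo arithmetic in the code below does. A minimal serial sketch of one sweep, with serial_sweep as a hypothetical reference helper rather than part of the posted program, would look like this:
/* One serial sweep of the 8-neighbour average described above.
 * Rows 0 and n-1 are boundary rows and are left untouched; the column
 * index wraps periodically, matching the modulo arithmetic in the MPI
 * code below. serial_sweep is a hypothetical helper, useful for
 * checking the parallel result on a small n. */
void serial_sweep(int n, double a[n][n], double b[n][n])
{
    for (int i = 1; i < n - 1; i++) {
        for (int j = 0; j < n; j++) {
            int jm = (j + n - 1) % n;   /* column to the left, wrapped  */
            int jp = (j + 1) % n;       /* column to the right, wrapped */
            b[i][j] = (a[i-1][jm] + a[i-1][j] + a[i-1][jp]
                     + a[i  ][jm]             + a[i  ][jp]
                     + a[i+1][jm] + a[i+1][j] + a[i+1][jp]) / 8.0;
        }
    }
}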
Here is the code.
#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, const char * argv[])
{
// Initialize the MPI environment
MPI_Init(NULL,NULL);
int p;
MPI_Comm_size(MPI_COMM_WORLD, &p);
int id;
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n;
n=atoi(argv[argc-1]);
//calculate the starting and ending row indices
int m=floor((n-2)/p);
int r=n-2-m*p;
//dividing row 1 to row n-2 by rows since in C/C++ arrays are stored row-wise
//therefore n-2=(m+1)*r+m*(p-r)
//the first r processors get m+1 rows, the rest get m rows
//starting row, ending row in the original A, width of matrix
int start_row, end_row, width;
if (id<=r-1){
start_row=1+id*(m+1);
end_row=start_row+m;
width=m+1;
}
else {
start_row=1+r*(m+1)+(id-r)*m;
end_row=start_row+m-1;
width=m;
}
//printf("mpi debug 1");
printf("on processor %d, starting row is %d, ending row is %d \n",id,start_row,end_row);
//id of the processor before and after
//id_before is not significant for id==0
//id_after is not significant for id==p-1
int id_before, id_after;
if (id==0)
id_before=p-1;
else
id_before=id-1;
if (id==p-1)
id_after=0;
else
id_after=id+1;
//printf("debug000");
//initialize the local matrix
//**** better way to initialize?
double a[width][n], b[width][n];
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=0.5;
//two 1d arrays to store the halo cells
double halo_before[n], halo_after[n];
if (id==0){
for (int j=0; j<n; j++){
halo_before[j]=0.0;
}
}
if (id==p-1){
for (int j=0; j<n; j++) {
halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
}
}
MPI_Barrier(MPI_COMM_WORLD);
//std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" <<std::endl;
//set id=0 to be the root processor and call
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Status status_before, status_after;
MPI_Request request_before, request_after;
/////////////////////////////////////////////////////////
//to debug, print out arrays
//std::cout<< " the array on processor " << id <<" to start is \n" << std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "\n" <<std::endl;
// }
//}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//////////////////////////////////////////////////////
//begin iteration
for (int t=0; t<500; t++){
//non-blocking send
//send first row to id_before:
//how should I use tag?
if (id>0){
MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t , MPI_COMM_WORLD, &request_before);
}
if (id<p-1){
//send the last row to id_after
MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
}
//printf("dubug0");
//update the entries that do not depend on halos
//local row=1 to row=width-2
//add if (width>3)??
int j_b, j_a;
for (int i=1; i<width-1; i++){
for (int j=0; j<n; j++){
j_b=(n+j-1)%n;
j_a=(j+1)%n;
b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
}
}
//printf("dubug1");
//blocking receive
//may consider a non-blocking receive
//receive from id_before and store in halo_before
//not sure about status
if (id>0){
MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before ,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//receive from id_after and store in halo_after
if (id<p-1){
MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " at iteration " << t<< " is\n" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " at iteration " << t<< " is\n" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//update entries that depend on halos
//bugs here, what if width==1???
if (width==1){
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
}
}
else{
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
}
}
//copy b back into a
//but make sure the sends have completed first
if (id>0)
MPI_Wait(&request_before,MPI_STATUS_IGNORE);
if (id<p-1)
MPI_Wait(&request_after,MPI_STATUS_IGNORE);
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=b[i][j];
//to debug, print out arrays
//std::cout<< " the array on processor " << id <<" at iteration " << t <<" is \n"<< std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "\n" <<std::endl;
// }
//}
}
//calculate the sum
double sum=0.0;
for (int i=0; i<width; i++)
sum += a[i][i+start_row];
double total_sum;
//send to root processor
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
//double sum_receive[p];
//double sum_calc;
//for (int i=0; i<p; i++){
// MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// sum_calc += sum_receive[i];
//}
printf("time elapse is %f \n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
Answer (score: 3)
You have a simple typo in one line of your code. This:
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
should be this (note the third term: a[width-2][j_a], not a second a[width-2][j_b]):
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
Because this happens in the last row of each domain, the exact amount of error it causes depends on where the domain boundaries fall, for example, on how many processors you use.
Now, the reason an error like this is more or less inevitable in the code you posted is that the same computation, updating b from a, appears no fewer than three times with slight modifications. That is a ticking time bomb: eventually one of the copies gets changed and becomes inconsistent with the others, or an edit to one of them introduces some error.
There are a number of ways to reduce the amount of duplication here, which both clarifies the code and helps avoid such errors. The best is to fold the halos into the a and b arrays themselves, adding an extra row before and after the local data to hold them; that way you no longer have to worry about whether width == 1 or treat the end rows separately. In addition, define a function that updates a row (or an element) of b from a, and call that function instead of repeating the code.
Below is an example of cleaned-up code along those lines: it encapsulates the individual pieces into routines and handles n, width, etc. consistently with the halo regions included.
#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int min2i(int a, int b) {
int result = a;
if (b < a) result = b;
return result;
}
void decomposition(const int n, const int nprocs, const int id,
int *start_row, int *width, int *id_before, int *id_after) {
const int nrows = n;
const int m = nrows/nprocs;
const int r = nrows % nprocs;
*width = m;
if (id < r) (*width)++;
*start_row = 1 + id*m + min2i(id,r);
*id_before = (id > 0 ? id-1 : MPI_PROC_NULL);
*id_after = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}
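/* Halo exchange helpers: startBC posts non-blocking sends of the first and
   last owned rows to the neighbouring ranks; finishBC receives them into the
   halo rows a[0][*] and a[width+1][*], copies the periodic ghost columns
   0 and n+1, and then waits on the outstanding sends. Since the end ranks
   were given MPI_PROC_NULL neighbours in decomposition(), those transfers
   degenerate to no-ops there, so rank 0 keeps its zero boundary row and
   rank p-1 keeps its sin() boundary row with no special cases. */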
void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Isend(&a[1][1], n, MPI_DOUBLE, id_before, 2*t , MPI_COMM_WORLD, &req[0]);
MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}
void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Status stats[2];
MPI_Recv(&a[0][1], n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after, 2*t , MPI_COMM_WORLD, &stats[1]);
for (int i=0; i<width+2; i++) {
a[i][0] = a[i][n];
a[i][n+1] = a[i][1];
}
MPI_Waitall(2, req, stats);
}
void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
for (int j=1; j<=n; j++)
b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
+a[ row ][j-1] + a[ row ][j+1]
+a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}
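/* updateRow computes one row of the 8-neighbour average; it assumes the halo
   rows (0 and width+1) and the periodic ghost columns (0 and n+1) of a have
   already been filled, so interior and boundary rows can share the same code. */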
int main(int argc, const char * argv[])
{
int p, id;
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n=atoi(argv[argc-1]);
int width, start_row, id_before, id_after;
decomposition(n, p, id, &start_row, &width, &id_before, &id_after);
double a[width+2][n+2], b[width+2][n+2];
for (int i=0; i<width+2; i++)
for (int j=0; j<n+2; j++)
a[i][j]=0.5;
if (id==p-1)
for (int j=0; j<n+2; j++)
a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));
if (id==0)
for (int j=0; j<n+2; j++)
a[0][j]=0.;
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Request reqs[2];
//begin iteration
for (int t=0; t<2; t++){
startBC(n, width, a, b, id_before, id_after, t, reqs);
/* interior rows */
for (int row=2; row<width; row++)
updateRow(n, width, a, b, row);
finishBC(n, width, a, b, id_before, id_after, t, reqs);
/* boundary rows */
updateRow(n, width, a, b, 1);
updateRow(n, width, a, b, width);
for (int i=1; i<width+1; i++)
for (int j=1; j<n+1; j++)
a[i][j]=b[i][j];
}
//calculate the sum
double sum=0.0;
for (int i=1; i<width+1; i++)
sum += a[i][i+start_row-1];
double total_sum;
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
printf("time elapse is %f \n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
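For what it's worth, this cleaned-up version is plain C (it relies on C99 variable-length array parameters), so it would typically be built with something like mpicc -std=c99 (plus -lm for sin if your toolchain needs it) and launched with mpirun, passing n as the last command-line argument; the exact wrapper and launcher names depend on your MPI installation.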