Question

/***every function is working correct but after only first iteration is giving collective abort anyone can tell what is or coulde be the reason***/
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"

const double tolerance = 0.00001;
const int maxit = 10000;

void MPE_decomp1d(int n, int size, int id, int *s, int *e)
{
    /*****calculating start and end row for every process*****/
    *s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
        *e = *s + (n/size)+((n%size)>id);
}

void onedinit(double **a, double **b, double **f, const int nx, const int s, const int e)
{
    int i, j;
    int ls, le;
    ls = s - (s!=0); 
    le = e + (e!=nx);
    /***setting all the intial values to zero****/
    for (i = ls; i < le; i++)
    {
        for (j = 0; j < nx; j++)
        {
            a[i][j] = b[i][j] = f[i][j] = 0.0;
        }
    }
    //***************************Defining Boundary Condition***********************************//
    /***setting left boundary to 1***/
    for (i = ls; i < le; i++) a[i][0] = b[i][0] = 1;
    /***setting value f(0, i) is 2***/
    if (s==0) for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}   

void exchng1(double **a, const int nx, const int s, const int e, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
    int rank, coord;
    MPI_Status status;
    MPI_Comm_rank(comm1d, &rank);
    MPI_Cart_coords(comm1d, rank, 1, &coord);
    /*****************if process id is odd then first send and if even then first recive to avoid deadlock**********/
    if (coord&1)
    {
        if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);  
        if (nbrtop != -1)    MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
        if (nbrtop != -1)    MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
        if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
    }
    else
    {
        if (nbrtop != -1)    MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
        if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
        if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
        if (nbrtop != -1)    MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
    }
}

void sweep1d(double **a, double **f, int nx, const int s, const int e, double **b)
{
    int i, j;
    int rows;
    rows = e - s - (s==0) - (e==0);
    nx -= 1;
    double h = 1.0 / (double)nx;
    for (i = 1; i <= rows; i++) for (j = 1; j < nx; j++)
        b[i][j] = 0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
    return;
}

double diff(double **a, double **b, const int nx, int s, int e)
{
    double sum = 0.0;
    int i, j;
    int st, ed;
    st = (s!=0);
    ed = e-s+(s!=0);
    for (i = st; i < ed; i++) for (j = 0; j < nx; j++)
        sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
    return sum;
}
int main(int argc, char *argv[])
{
    int nx, ny;
    int myid, root, numprocs, period=0;
    int nbrbottom, nbrtop, s, e, it;

    double diffnorm, dwork;
    double t1, t2;
    double **a, **b, **f;

    root = 0;
    MPI_Comm comm1d;
    MPI_Init(&argc, &argv);;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    if(!myid)
    {
        /******for this piece of code nx and ny are assumed to be same please*******/
        printf("Enter the number of cells in X & Y direction\n");
        scanf("%d %d", &nx, &ny);
        nx += 1;
        ny += 1;
        ny = nx; //forced to follow our assumption;
    }
    MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);

    MPI_Comm_rank(comm1d, &myid);
    MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);

    MPE_decomp1d(ny, numprocs, myid, &s, &e);
    int ls, le, rows;
    int i, j;
    ls = s - (s!=0); 
    le = e + (e!=nx);
    rows = le - ls;

    a = (double**)malloc(rows*sizeof(double*));
    b = (double**)malloc(rows*sizeof(double*));
    f = (double**)malloc(rows*sizeof(double*));
    for (i = ls; i < le; i++)
    {
        a[i] = (double*)malloc(nx*sizeof(double));
        b[i] = (double*)malloc(nx*sizeof(double));
        f[i] = (double*)malloc(nx*sizeof(double));
    }
    onedinit(a, b, f, nx, s, e);
    diffnorm = 0.0;
    it = 0;
    do
    {
//      printf("%danshu\n", myid);

        exchng1(a, nx, s, e, comm1d, nbrbottom, nbrtop);
        sweep1d(a, f, nx, s, e, b);
        exchng1(b, nx, s, e, comm1d, nbrbottom, nbrtop);
        sweep1d(b, f, nx, s, e, a);
        dwork = diff(a, b, nx, s, e);

        /************printing matrix a after every iteration******/
        for (i = 0; i < rows; i++) 
        {
            for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
            printf("\n");
        }

        MPI_Barrier(comm1d);

        //printf("%lfhehe\n", dwork);
        MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
        //printf("%dhere\n", myid);
    } 
    while (++it < maxit && diffnorm > tolerance);

    MPI_Finalize();
    return 0;
}

Answer 1

所以只是在SO中倾倒130行代码并询问它为什么不起作用可能不是获得好答案的最佳方式 - 尤其是当您编写的唯一实际编码是“每个函数都在工作”时...在这种情况下，你不会有问题。您需要将事情缩小到更具体的案例并获得更具体的问题。

在这个特殊情况下，我在教学过程中看到过很多像这样的代码，所以看到一些正在发生的事情是可行的。

首先，你不能做这样的事情：

ls = s - (s!=0); 
le = e + (e!=nx);
rows = le - ls;

a = (double**)malloc(rows*sizeof(double*));
/*...*/
for (i = ls; i < le; i++)
{
    a[i] = (double*)malloc(nx*sizeof(double));
    /*...*/
}

如果您有100行分为4个处理器，并且您（比方说）MPI任务2，那么您的s为50且e为75，因此ls将是49而le将是76，所以即使您只分配了大小为27的a[49..76]，您也试图访问a！整个代码都会出现这个特殊错误，需要修复。您想要访问a[0..rows-1]。

顺便说一句，我甚至没有检查MPE_decomp1d是否确实做了正确的事情。我们都经历了这样一个阶段，我们认为它在C中很可爱，通过使用三元运算符等逻辑表达式将事物放在一行中，但严重的是，当其他人必须修复它时，它会使你的代码不必要地解开 - 2个月之后，无论是SOers还是你自己。

在exchng1中，你正在做不必要的工作。您无需检查nbrbottom或nbrtop是否有效;如果不是，则MPI_Cart_shift返回MPI_PROC_NULL，其中发送或接收是无操作。因此，从这些级别发送/接收是无害的，这是一个很好的设计决策，因为它避免了逻辑中的许多极端情况。

同样，为避免死锁，您可以使用MPI_Sendrecv而非个人Send和Recv。加上上述意味着代替：

if (coord&1)
{
    if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);  
    if (nbrtop != -1)    MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
    if (nbrtop != -1)    MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
    if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
    if (nbrtop != -1)    MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
    if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
    if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
    if (nbrtop != -1)    MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}

你可以这样做：

MPI_Sendrecv(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);

- 方式更简单，对吧？

但交换中仍然存在一些问题;接收到a[e-s+1]是不对的，尽管正如我所提到的，我不能解密MPE_decomp1d以找出原因。大概你想收到a[rows-1]。

最后，MPI_Barrier()缓慢且完全不必要;在保护单元交换中有足够的同步（更不用说Allreduce），你不需要它。

完成所有这些更改后，代码将在没有内存访问问题的情况下运行;你必须检查它给出了正确的答案。

#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"

const double tolerance = 0.00001;
const int maxit = 10000;

void MPE_decomp1d(int n, int size, int id, int *rows)
{
    int s, e;
    s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
    e = s + (n/size)+((n%size)>id);
    *rows = e - s - (s==0) - (e==0);
}

void onedinit(double **a, double **b, double **f, const int nx, const int rows, const int id, const int nprocs)
{
    int i, j;

    for (i = 0; i < rows; i++)
    {
        for (j = 0; j < nx; j++)
        {
            a[i][j] = b[i][j] = f[i][j] = 0.0;
        }
    }
    for (i = 0; i < rows; i++) a[i][0] = b[i][0] = 1;

    if (id == 0)
        for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int rows, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
    int rank, coord;
    MPI_Status status;
    MPI_Comm_rank(comm1d, &rank);
    MPI_Cart_coords(comm1d, rank, 1, &coord);

    /* send data downwards */
    MPI_Sendrecv(a[rows-2], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
    /* send data upwards */
    MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[rows-1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}

void sweep1d(double **a, double **f, const int nx, const int rows, double **b)
{
    int i, j;
    double h = 1.0 / (double)nx;
    for (i = 1; i < rows-1; i++) for (j = 1; j < nx-1; j++)
        b[i][j] =  
            0.25 * ( a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
    return;
}

double diff(double **a, double **b, const int nx, const int rows)
{   
    double sum = 0.0;
    int i, j;
    for (i = 0; i < rows; i++) for (j = 0; j < nx; j++)
        sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
    return sum;
}
int main(int argc, char *argv[])
{   
    int nx, ny;
    int myid, root, numprocs, period=0;
    int nbrbottom, nbrtop, it;

    double diffnorm, dwork;

    double **a, **b, **f;

    root = 0;
    MPI_Comm comm1d;
    MPI_Init(&argc, &argv);;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    if(!myid)
    {   
        /******for this piece of code nx and ny are assumed to be same please*******/
        printf("Enter the number of cells in X & Y direction\n");
        scanf("%d %d", &nx, &ny);
        nx += 1;
        ny += 1; 
        ny = nx; //forced to follow our assumption;
    }
    MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);

    MPI_Comm_rank(comm1d, &myid);
    MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);

    int rows;
    MPE_decomp1d(ny, numprocs, myid, &rows);
    int i, j;

    a = (double**)malloc(rows*sizeof(double*));
    b = (double**)malloc(rows*sizeof(double*));
    f = (double**)malloc(rows*sizeof(double*));
    for (i = 0; i < rows; i++)
    {
        a[i] = (double*)malloc(nx*sizeof(double));
        b[i] = (double*)malloc(nx*sizeof(double));
        f[i] = (double*)malloc(nx*sizeof(double));
    }
    onedinit(a, b, f, nx, rows, myid, numprocs);
    diffnorm = 0.0;
    it = 0;
    do
    {
        exchng1(a, nx, rows, comm1d, nbrbottom, nbrtop);
        sweep1d(a, f, nx, rows, b);
        exchng1(b, nx, rows, comm1d, nbrbottom, nbrtop);
        sweep1d(b, f, nx, rows, a);
        dwork = diff(a, b, nx, rows);

        /************printing matrix a after every iteration******/
        for (i = 0; i < rows; i++)
        {
            for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
            printf("\n");
        }

        //printf("%lfhehe\n", dwork);
        MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
        //printf("%dhere\n", myid);
    }
    while (++it < maxit && diffnorm > tolerance);

    MPI_Finalize();
    return 0;
}

在我的可能方程代码中获得集体中止

1 个答案: