Is there a limit on the number of processes in MPI?

Asked: 2014-08-09 09:45:02

Tags: c parallel-processing pthreads mpi hpc

I am reading "Using MPI" and trying to run the code myself. Section 6.3 contains a grid-decomposition code. It compiles without any warnings or errors and runs with a small number of processes, but on my laptop it fails with larger counts, say 30. My laptop has 4 cores with hyper-threading and 8 GB of memory. Neither version of la_grid_2d_new works at larger counts, although the first version tolerates somewhat more processes, say 35, while 40 processes fail. I don't know why. Could you please help me? Thanks a lot.

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

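/* A logical P x Q process grid: (p,q) are this process's grid coordinates,
   plus communicators for the whole grid, its row, and its column. */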
typedef struct
{
    int P, Q;
    int p, q;
    MPI_Comm grid_comm;
    MPI_Comm row_comm;
    MPI_Comm col_comm;
} LA_Grid_2d;

LA_Grid_2d *la_grid_2d_new(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    MPI_Comm row, col;
    int my_rank, p, q;
    MPI_Comm_rank(comm, &my_rank);
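    /* Row-major mapping from rank to grid coordinates: rank = p*Q + q. */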
    p=my_rank/Q;
    q=my_rank%Q;
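    /* Processes sharing the same p form a row, those sharing the same q a
       column; the key argument (q or p) orders ranks within each piece. */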
    MPI_Comm_split(comm, p, q, &row);
    MPI_Comm_split(comm, q, p, &col);
    grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
    grid->grid_comm=comm;
    grid->row_comm=row;
    grid->col_comm=col;
    grid->P=P;
    grid->Q=Q;
    grid->p=p;
    grid->q=q;
    return grid;
}

LA_Grid_2d *la_grid_2d_new_II(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    MPI_Comm comm_2d, row, col;
    int my_rank, p, q;
    int dims[2]={P,Q}, local[2], period[2]={0,0}, remain_dims[2];

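    /* Build a P x Q Cartesian topology; reorder=1 allows MPI to renumber
       the ranks to better match the hardware. */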
    MPI_Cart_create(comm, 2, dims, period, 1, &comm_2d);
    /* With reorder=1, a process's rank in comm_2d may differ from its rank
       in comm, so the coordinate lookup must use the rank in comm_2d. */
    MPI_Comm_rank(comm_2d, &my_rank);
    MPI_Cart_coords(comm_2d, my_rank, 2, local);
    p=local[0];
    q=local[1];
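    /* MPI_Cart_sub keeps the dimensions flagged with 1: keeping only the
       second dimension yields row communicators; keeping only the first
       yields column communicators. */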
    remain_dims[0]=0;
    remain_dims[1]=1;
    MPI_Cart_sub(comm_2d, remain_dims, &row);
    remain_dims[0]=1;
    remain_dims[1]=0;
    MPI_Cart_sub(comm_2d, remain_dims, &col);
    grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d));
    grid->grid_comm=comm;
    grid->row_comm=row;
    grid->col_comm=col;
    grid->P=P;
    grid->Q=Q;
    grid->p=p;
    grid->q=q;
    return grid;
}

void la_grid_2d_delete(LA_Grid_2d *grid)
{
    free(grid);
}
int main(int argc, char **argv)
{
    LA_Grid_2d *pgrid;
    int size, rank, dims[2]={0,0}, row, col;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if(rank==0)
        printf("size=%d rank=%d\n", size, rank);
    MPI_Dims_create(size, 2, dims);
//  pgrid=la_grid_2d_new(MPI_COMM_WORLD, dims[0], dims[1]);
    pgrid=la_grid_2d_new_II(MPI_COMM_WORLD, dims[0], dims[1]);
    if(rank==0)
        printf("dims[0]=%d dims[1]=%d\n", dims[0], dims[1]);
    MPI_Reduce(&rank, &row, 1, MPI_INT, MPI_SUM, 0, pgrid->row_comm);
    MPI_Reduce(&rank, &col, 1, MPI_INT, MPI_SUM, 0, pgrid->col_comm);
    la_grid_2d_delete(pgrid);
    MPI_Finalize();
    if(rank==0)
        printf("row=%d col=%d\n", row, col);
    return 0;
}

The error message is:

shuang@phoebe:~/usingMPI$ mpiexec -n 20 ./grid
size=20 rank=0
dims[0]=5 dims[1]=4
row=6 col=40

shuang@phoebe:~/usingMPI$ mpiexec -n 30 ./grid
size=30 rank=0
dims[0]=6 dims[1]=5
[phoebe:14939] *** Process received signal ***
[phoebe:14939] Signal: Floating point exception (8)
[phoebe:14939] Signal code: Integer divide-by-zero (1)
[phoebe:14939] Failing at address: 0x7fb1e599e6f7
[phoebe:14939] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7fb1e5714cb0]
[phoebe:14939] [ 1] /usr/lib/libmpi.so.0(mca_topo_base_cart_coords+0x57) [0x7fb1e599e6f7]
[phoebe:14939] [ 2] /usr/lib/libmpi.so.0(mca_topo_base_cart_sub+0x166) [0x7fb1e599ec36]
[phoebe:14939] [ 3] /usr/lib/libmpi.so.0(PMPI_Cart_sub+0xba) [0x7fb1e596f34a]
[phoebe:14939] [ 4] ./grid(la_grid_2d_new_II+0xd6) [0x400df6]
[phoebe:14939] [ 5] ./grid(main+0x98) [0x400f07]
[phoebe:14939] [ 6] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed) [0x7fb1e536776d]
[phoebe:14939] [ 7] ./grid() [0x400b99]
[phoebe:14939] *** End of error message ***
--------------------------------------------------------------------------
mpiexec noticed that process rank 22 with PID 14939 on node phoebe exited on signal 8 (Floating point exception).
--------------------------------------------------------------------------

1 Answer:

Answer 0 (score: 2)

@Sean If you want to try another Open MPI, you can usually download the source and build it with something like:
./configure --prefix=/opt/ompi-[version]
make
sudo make install

Since this installs into a non-standard location (so it can be removed easily later), you will need to set LD_LIBRARY_PATH=/opt/ompi-[version]/lib and give the full paths to mpicc and mpirun to make sure you invoke the correct version. Somewhere in the build process it will remind you to set LD_LIBRARY_PATH.
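For example, a minimal sketch of a session (assuming the prefix chosen above and that the source file is named grid.c; Open MPI installs its compiler and launcher wrappers under the prefix's bin directory):

export LD_LIBRARY_PATH=/opt/ompi-[version]/lib:$LD_LIBRARY_PATH
/opt/ompi-[version]/bin/mpicc -o grid grid.c
/opt/ompi-[version]/bin/mpiexec -n 30 ./grid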