The purpose of my program is to calculate the electrostatic potential between an inner and an outer conductor by splitting the domain into a grid and then into grid slices. Each processor gets one slice and runs the calculation on that slice. I use MPI_Isend and MPI_Irecv to send data between the processors. When testing the code I get a segmentation fault:
[physnode5:81440] *** Process received signal ***
[physnode5:81440] Signal: Segmentation fault (11)
[physnode5:81440] Signal code: Address not mapped (1)
[physnode5:81440] Failing at address: 0x58
[physnode5:81440] [ 0] /lib64/libpthread.so.0(+0xf5d0)[0x2ab8069df5d0]
[physnode5:81440] [ 1] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(ompi_request_default_wait+0xd)[0x2ab8066495ed]
[physnode5:81440] [ 2] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(MPI_Wait+0x5d)[0x2ab80667a00d]
[physnode5:81440] [ 3] ./mpi_tezt.exe[0x400ffc]
[physnode5:81440] [ 4] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2ab806c0e3d5]
[physnode5:81440] [ 5] ./mpi_tezt.exe[0x4009b9]
[physnode5:81440] *** End of error message ***
when executing this bit of code. Note that I am running this on the cluster. The file name is mpi_tezt.exe (yes, I spelt it wrong). I have checked that the arrays to be sent are allocated correctly, and that the sends and recvs are not sending or receiving data that doesn't exist (i.e. sending data beyond the bounds of the array). My MPI_Isend and MPI_Irecv code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
/*MPI Specific Variables*/
int my_size, my_rank, up, down;
MPI_Request reqU, reqD, sreqU, sreqD;
MPI_Status rUstatus, rDstatus, sUstatus, sDstatus;
/*Physical Dimensions*/
double Linner = 5.0;/*mm*/
double Rinner = 1.0;/*mm*/
double phi_0 = 1000.0;/*V*/
/*Other Variables*/
int grid_size = 100;
int slice;
int x,y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary, yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/*Determining neighbours*/
if (my_rank != 0) /*if statements keep the highest- and lowest-rank neighbours inside the 0 to my_size-1 range of ranks*/
{
up = my_rank-1;
}
else
{
up = 0;
}
if(my_rank != my_size-1)
{
down = my_rank+1;
}
else
{
down = my_size-1;
}
/*cross-check: presumed my_size is a factor of gridsize else there are odd sized slices and this is not coded for*/
if (grid_size%my_size != 0)
{
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/*Set Up Distributed Data Approach*/
slice = grid_size/my_size;
yboundary = Linner/grid_res_y; /*y grid index of inner conductor wall*/
xboundary = Rinner/grid_res_x; /*x grid and individual array index of inner conductor wall*/
double phi[slice+2][grid_size]; /*extra 2 rows to allow for halo data*/
for (y=0; y < slice+2; y++)
{
for (x=0; x < grid_size; x++)
{
phi[y][x] = 0.0;
}
}
if(my_rank == 0) /*Boundary Containing rank does 2 loops. One over part with inner conductor and one over part without inner conductor*/
{
for(y=0; y < slice+1; y++)
{
for(x=xboundary; x < grid_size; x++)
{
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size-1)
{
/*send top most strip up one node to be received as bottom halo*/
MPI_Isend(&phi[1][0], grid_size , MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);
/*recv top halo from up one node*/
MPI_Irecv(&phi[slice+1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);
}
if (my_rank > 0)
{
/*recv top halo from down one node*/
MPI_Irecv(&phi[0][0], grid_size , MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &reqD);
/*send bottom most strip down one node to be received as top halo*/
MPI_Isend(&phi[slice][0], grid_size , MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &sreqD);
}
if (my_rank<my_size-1)
{
/*Wait for send to down one rank to complete*/
MPI_Wait(&sreqD, &sDstatus);
/*Wait for receive from up one rank to complete*/
MPI_Wait(&reqD, &rDstatus);
}
if (my_rank>0)
{
/*Wait for send to up one rank to complete*/
MPI_Wait(&sreqU, &sUstatus);
/*Wait for receive from down one rank to complete*/
MPI_Wait(&reqU, &rUstatus);
}
MPI_Finalize();
return 0;
}
I have tested this on 2 processors (ranks 0 and 1) and I am hoping to scale it up to more.
Any ideas what might be causing the problem?
Answer 0 (score: 1)
You are faulting on the first MPI_Wait (for rank 0). This is step 7 in the sample code below, run with mpirun -np 2 ./whatever.
It appears that sreqD is not being set correctly. It is set at step 5, but by rank 1. Step 7, however, is executed by rank 0, which never sets sreqD.
So, you need to adjust your if statements so that each MPI_Wait (etc.) is matched up with the rank that actually posted the request.
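For reference, here is a minimal sketch of what the corrected wait pairing could look like, reusing the request and status variables from the code above; it only changes which requests each branch waits on, nothing else:

if (my_rank < my_size - 1)
{
    /* this branch posted sreqU and reqU (the exchange with `down`),
       so those are the requests it must wait on */
    MPI_Wait(&sreqU, &sUstatus);
    MPI_Wait(&reqU, &rUstatus);
}
if (my_rank > 0)
{
    /* this branch posted reqD and sreqD (the exchange with `up`),
       so those are the requests it must wait on */
    MPI_Wait(&sreqD, &sDstatus);
    MPI_Wait(&reqD, &rDstatus);
}

(Whether the exchange then completes also depends on the send and receive tags agreeing on both sides of each transfer, which is worth double-checking in the posted code.)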
Here is the code with some debugging printf statements added:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int
main(int argc, char *argv[])
{
/* MPI Specific Variables */
int my_size,
my_rank,
up,
down;
MPI_Request reqU,
reqD,
sreqU,
sreqD;
MPI_Status rUstatus,
rDstatus,
sUstatus,
sDstatus;
/* Physical Dimensions */
double Linner = 5.0; /* mm */
double Rinner = 1.0; /* mm */
double phi_0 = 1000.0;
/*V*/
/* Other Variables */
int grid_size = 100;
int slice;
int x,
y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary,
yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/* Determining neighbours */
/* if statements keep the highest- and lowest-rank neighbours inside
   the 0 to my_size-1 range of ranks */
if (my_rank != 0) {
up = my_rank - 1;
}
else {
up = 0;
}
if (my_rank != my_size - 1) {
down = my_rank + 1;
}
else {
down = my_size - 1;
}
printf("my_rank=%d my_size=%d up=%d down=%d\n",my_rank,my_size,up,down);
/* cross-check: presumed my_size is a factor of gridsize else there are
odd sized slices and this is not coded for */
if (grid_size % my_size != 0) {
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/* Set Up Distributed Data Approach */
slice = grid_size / my_size;
/* y grid index of inner conductor wall */
yboundary = Linner / grid_res_y;
/* x grid and individual array index of inner conductor wall */
xboundary = Rinner / grid_res_x;
if (my_rank == 0) {
printf("Linner=%g grid_res_y=%g yboundary=%d\n",
Linner,grid_res_y,yboundary);
printf("Rinner=%g grid_res_x=%g xboundary=%d\n",
Rinner,grid_res_x,xboundary);
printf("slice=%d grid_size=%d phi=%ld\n",
slice,grid_size,sizeof(double) * (slice + 2) * grid_size);
}
/* extra 2 rows to allow for halo data */
double phi[slice + 2][grid_size];
for (y = 0; y < slice + 2; y++) {
for (x = 0; x < grid_size; x++) {
phi[y][x] = 0.0;
}
}
/* Boundary Containing rank does 2 loops. One over part with inner
conductor and one over part without inner conductor */
if (my_rank == 0) {
for (y = 0; y < slice + 1; y++) {
for (x = xboundary; x < grid_size; x++) {
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size - 1) {
/* send top most strip up one node to be received as bottom halo */
printf("1: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD,
&sreqU);
/* recv top halo from up one node */
printf("2: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2,
MPI_COMM_WORLD, &reqU);
printf("3: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* recv top halo from down one node */
printf("4: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD,
&reqD);
/* send bottom most strip down one node to be received as top halo */
printf("5: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD,
&sreqD);
printf("6: my_rank=%d\n",my_rank);
}
if (my_rank < my_size - 1) {
/* Wait for send to down one rank to complete */
printf("7: my_rank=%d\n",my_rank);
MPI_Wait(&sreqD, &sDstatus);
printf("8: my_rank=%d\n",my_rank);
/* Wait for receive from up one rank to complete */
printf("9: my_rank=%d\n",my_rank);
MPI_Wait(&reqD, &rDstatus);
printf("10: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* Wait for send to up one rank to complete */
printf("11: my_rank=%d\n",my_rank);
MPI_Wait(&sreqU, &sUstatus);
printf("12: my_rank=%d\n",my_rank);
/* Wait for receive from down one rank to complete */
printf("12: my_rank=%d\n",my_rank);
MPI_Wait(&reqU, &rUstatus);
printf("13: my_rank=%d\n",my_rank);
}
MPI_Finalize();
return 0;
}
Here is the output. Note that step 7 does get printed (just before rank 0's first MPI_Wait). But rank 0 never reaches step 8 (the printf after that call):
my_rank=0 my_size=2 up=0 down=1
Linner=5 grid_res_y=0.2 yboundary=25
Rinner=1 grid_res_x=0.1 xboundary=10
slice=50 grid_size=100 phi=41600
1: my_rank=0 MPI_Isend
2: my_rank=0 MPI_Irecv
3: my_rank=0
7: my_rank=0
my_rank=1 my_size=2 up=0 down=1
4: my_rank=1 MPI_Irecv
5: my_rank=1 MPI_Isend
6: my_rank=1
11: my_rank=1
[manderly:230404] *** Process received signal ***
[manderly:230403] *** Process received signal ***
[manderly:230403] Signal: Segmentation fault (11)
[manderly:230403] Signal code: Address not mapped (1)
[manderly:230403] Failing at address: 0x58
[manderly:230404] Signal: Segmentation fault (11)
[manderly:230404] Signal code: Address not mapped (1)
[manderly:230404] Failing at address: 0x58
[manderly:230403] [ 0] [manderly:230404] [ 0] /lib64/libpthread.so.0(+0x121c0)/lib64/libpthread.so.0(+0x121c0)[0x7fa5478341c0]
[0x7fa0ebe951c0]
[manderly:230404] [ 1] [manderly:230403] [ 1] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa0ec0e9a81]
[manderly:230404] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa547a88a81]
[manderly:230403] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa0ec12c350]
[manderly:230404] [ 3] ./fix2[0x400f93]
[manderly:230404] [ 4] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa547acb350]
[manderly:230403] [ 3] ./fix2[0x400ef7]
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa0ebaedfea]
[manderly:230404] [ 5] ./fix2[0x40081a[manderly:230403] [ 4] ]
[manderly:230404] *** End of error message ***
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa54748cfea]
[manderly:230403] [ 5] ./fix2[0x40081a]
[manderly:230403] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node manderly exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
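As an aside (not something the answer above does), a common way to avoid this kind of per-rank bookkeeping altogether is to give the edge ranks MPI_PROC_NULL as their missing neighbour: a send or receive addressed to MPI_PROC_NULL completes immediately, so every rank can post and wait on all four requests unconditionally. A rough sketch using the variable names from the question, with the tags adjusted so each send matches the corresponding receive:

/* edge ranks get MPI_PROC_NULL instead of a fake neighbour */
up   = (my_rank == 0)           ? MPI_PROC_NULL : my_rank - 1;
down = (my_rank == my_size - 1) ? MPI_PROC_NULL : my_rank + 1;

/* every rank posts all four operations; those addressed to MPI_PROC_NULL
   complete immediately with no data transfer */
MPI_Isend(&phi[1][0],         grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);
MPI_Irecv(&phi[0][0],         grid_size, MPI_DOUBLE, up,   1, MPI_COMM_WORLD, &reqD);
MPI_Isend(&phi[slice][0],     grid_size, MPI_DOUBLE, up,   2, MPI_COMM_WORLD, &sreqD);
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);

/* all four requests exist on every rank, so these waits are always safe */
MPI_Wait(&sreqU, &sUstatus);
MPI_Wait(&reqD,  &rDstatus);
MPI_Wait(&sreqD, &sDstatus);
MPI_Wait(&reqU,  &rUstatus);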