Question

在调用SUB EXC_MPI（MOD01）时，我正在努力使用无效的内存引用，并且正好在MPI_StartAll（注释）中。

! ********** file mod01.f90 ************ !
MODULE MOD01

implicit none
include 'mpif.h'
! alternatively
! use mpi
! implicit none
PRIVATE
! ...
INTERFACE exc_mpi
   MODULE PROCEDURE exc_mpi
END INTERFACE
PUBLIC exc_mpi

CONTAINS

subroutine exc_mpi (X)
!! send and receive from procs PN0 <-> PN1 and PN0 <-> PN2
real, dimension (ni:ns, m, l), intent(inout) :: X

logical, save :: frstime=.true.
integer, save :: mpitype_sn, mpitype_sp, mpitype_rn, mpitype_rp
integer, save :: requests(4), reqcount
integer       :: istatus(MPI_STATUS_SIZE,4), ierr

if (frstime) then
   call exc_init()
   frstime = .false.
end if
call MPI_StartAll(reqcount,requests,ierr)         !!  <-- segfault here
call MPI_WaitAll(reqcount,requests,istatus,ierr)
return

contains

subroutine exc_init

integer :: i0, ierrs(12), ktag

reqcount = 0
ierrs=0
ktag = 1

! find i0

if ( condition1 ) then
! send to PN2
   call MPI_Type_Vector(m*l, messlengthup(PN2), ns-ni+1, MPI_REAL, mpitype_sn, ierrs(1))
   call MPI_Type_Commit(mpitype_sn, ierrs(3))
   call MPI_Send_Init(X(i0, 1, 1), 1, mpitype_sn, PN2-1, ktag, MPI_COMM_WORLD, requests(reqcount+1), ierrs(5))
! recieve from PN2
   call MPI_Type_Vector(m*l, messlengthdo(PN0), ns-ni+1, MPI_REAL, mpitype_rn, ierrs(2))
   call MPI_Type_Commit(mpitype_rn,ierrs(4))
   call MPI_Recv_Init(X(nend(irank)+1, 1, 1), 1, mpitype_rn, PN2-1, ktag+1, MPI_COMM_WORLD, requests(reqcount+2), ierrs(6))
   reqcount = reqcount + 2
end if

if ( condition2 ) then
!   send and rec PN0 <-> PN1
   reqcount = reqcount + 2
end if

return
end subroutine exc_init

end subroutine exc_mpi

! ...

END MODULE MOD01

来电来自：

! ********** file mod02.f90 ************ !
MODULE MOD02

use MOD01, only: exc_mpi 

IMPLICIT NONE
include 'mpif.h'
! alternatively
! use mpi
! implicit none
PRIVATE

! ...

INTERFACE MYSUB
   MODULE PROCEDURE MYSUB
END INTERFACE
PUBLIC MYSUB

CONTAINS

SUBROUTINE MYSUB (Y)

IMPLICIT NONE
REAL,    INTENT(INOUT)   :: Y(nl:nr, m, l) ! ni<=nl, nr>=ns
REAL, ALLOCATABLE, DIMENSION(:,:,:) :: Y0
!...
allocate ( Y0(ni-1:ns, 1:m, 1:l) )

DO i = 1, icount

   Y0(nl:nr,:,:) = Y(:,:,:)
   call exc_mpi ( Y0(ni:ns, :, :) )           !  <-- segfault here
   call mpi_barrier (mpi_comm_world, ierr)
   Y0(ni-1,:,:) = 0.
   CALL SUB01

END DO
deallocate (Y0)
RETURN

CONTAINS

SUBROUTINE SUB01
!...
   FRE: DO iterm = 1, m
      DIR: DO iterl = 1, l
         DO itern = nl, nr
!            Y(itern, iterm, iterl) = some_lin_combination(Y0)
         END DO
      END DO DIR
   END DO FRE

END SUBROUTINE SUB01

! ...
END MODULE MOD02

当MAIN（实际上是模块中的一个子句）第二次调用MYSUB（上面的代码）时，会在运行时引发分段错误。该错误不是系统性的，因为如果将作业分成一定数量的进程（例如NPMAX，依赖于分解的数组），程序就可以工作。由于程序比NPMAX更多，程序会出现段错误。更多关于环境条件：

几乎[编译器+ mpi独立]：[gfortran + ompi]，[gfortran + mpich]和[ifort + mpich]也出现同样的问题。
发生在基于deb的（glibc）mini hpc和pcs上（没有问题“严重”nec，sun，ibm）

可以看出，MOD02将交换过程（MOD01）传递给非连续的切片阵列Y0。我几乎可以克服故障（NPMAX增长一个数量级）的唯一方法是来回交换尺寸，但这会导致执行速度减慢大约2倍。我敢打赌永久交换X / Y / Y0尺寸会解决，但我不想放松像SUB01这样的嵌套循环的效率（第一维比其他尺寸大得多）。

实际上，MOD02会创建一个临时数组。明确地这样做并不能解决问题。

强制分配堆或堆栈无法解决。

那里有任何暗示吗？感谢您的阅读

更新：在每次调用时初始化（从sub exc_mpi中的if语句调用exc_init（））确实解决了，但由于MAIN（未列出）循环很多，因此效率非常低。

UPDATE2（在@Gilles之后）：这种解决方法也不起作用，即使传递一个连续的数组（在本例中为Y1）并且mpi没有创建临时数。

SUBROUTINE MYSUB (Y)

IMPLICIT NONE
REAL,    INTENT(INOUT)   :: Y(nl:nr, m, l) ! ni<=nl, nr>=ns
REAL, ALLOCATABLE, DIMENSION(:,:,:) :: Y0, Y1
!...
allocate ( Y0(ni-1:ns, 1:m, 1:l) )
allocate ( Y1(ni:ns, 1:m, 1:l) )

DO i = 1, icount

   Y1(nl:nr,:,:) = Y(:,:,:)
   call exc_mpi ( Y0 )           !  <-- segfault here
   call mpi_barrier (mpi_comm_world, ierr)
   Y0(nl:nr,:,:) = Y1(nl:nr,:,:)
   Y0(ni-1,:,:) = 0.
   CALL SUB01

END DO
deallocate (Y1)
deallocate (Y0)
etc ...

超过N个进程的分段错误

0 个答案: