在调用SUB EXC_MPI(MOD01)时,我正在努力使用无效的内存引用,并且正好在MPI_StartAll(注释)中。
! ********** file mod01.f90 ************ !
MODULE MOD01
implicit none
include 'mpif.h'
! alternatively
! use mpi
! implicit none
PRIVATE
! ...
INTERFACE exc_mpi
MODULE PROCEDURE exc_mpi
END INTERFACE
PUBLIC exc_mpi
CONTAINS
subroutine exc_mpi (X)
!! send and receive from procs PN0 <-> PN1 and PN0 <-> PN2
real, dimension (ni:ns, m, l), intent(inout) :: X
logical, save :: frstime=.true.
integer, save :: mpitype_sn, mpitype_sp, mpitype_rn, mpitype_rp
integer, save :: requests(4), reqcount
integer :: istatus(MPI_STATUS_SIZE,4), ierr
if (frstime) then
call exc_init()
frstime = .false.
end if
call MPI_StartAll(reqcount,requests,ierr) !! <-- segfault here
call MPI_WaitAll(reqcount,requests,istatus,ierr)
return
contains
subroutine exc_init
integer :: i0, ierrs(12), ktag
reqcount = 0
ierrs=0
ktag = 1
! find i0
if ( condition1 ) then
! send to PN2
call MPI_Type_Vector(m*l, messlengthup(PN2), ns-ni+1, MPI_REAL, mpitype_sn, ierrs(1))
call MPI_Type_Commit(mpitype_sn, ierrs(3))
call MPI_Send_Init(X(i0, 1, 1), 1, mpitype_sn, PN2-1, ktag, MPI_COMM_WORLD, requests(reqcount+1), ierrs(5))
! recieve from PN2
call MPI_Type_Vector(m*l, messlengthdo(PN0), ns-ni+1, MPI_REAL, mpitype_rn, ierrs(2))
call MPI_Type_Commit(mpitype_rn,ierrs(4))
call MPI_Recv_Init(X(nend(irank)+1, 1, 1), 1, mpitype_rn, PN2-1, ktag+1, MPI_COMM_WORLD, requests(reqcount+2), ierrs(6))
reqcount = reqcount + 2
end if
if ( condition2 ) then
! send and rec PN0 <-> PN1
reqcount = reqcount + 2
end if
return
end subroutine exc_init
end subroutine exc_mpi
! ...
END MODULE MOD01
来电来自:
! ********** file mod02.f90 ************ !
MODULE MOD02
use MOD01, only: exc_mpi
IMPLICIT NONE
include 'mpif.h'
! alternatively
! use mpi
! implicit none
PRIVATE
! ...
INTERFACE MYSUB
MODULE PROCEDURE MYSUB
END INTERFACE
PUBLIC MYSUB
CONTAINS
SUBROUTINE MYSUB (Y)
IMPLICIT NONE
REAL, INTENT(INOUT) :: Y(nl:nr, m, l) ! ni<=nl, nr>=ns
REAL, ALLOCATABLE, DIMENSION(:,:,:) :: Y0
!...
allocate ( Y0(ni-1:ns, 1:m, 1:l) )
DO i = 1, icount
Y0(nl:nr,:,:) = Y(:,:,:)
call exc_mpi ( Y0(ni:ns, :, :) ) ! <-- segfault here
call mpi_barrier (mpi_comm_world, ierr)
Y0(ni-1,:,:) = 0.
CALL SUB01
END DO
deallocate (Y0)
RETURN
CONTAINS
SUBROUTINE SUB01
!...
FRE: DO iterm = 1, m
DIR: DO iterl = 1, l
DO itern = nl, nr
! Y(itern, iterm, iterl) = some_lin_combination(Y0)
END DO
END DO DIR
END DO FRE
END SUBROUTINE SUB01
! ...
END MODULE MOD02
当MAIN(实际上是模块中的一个子句)第二次调用MYSUB(上面的代码)时,会在运行时引发分段错误。 该错误不是系统性的,因为如果将作业分成一定数量的进程(例如NPMAX,依赖于分解的数组),程序就可以工作。由于程序比NPMAX更多,程序会出现段错误。 更多关于环境条件:
可以看出,MOD02将交换过程(MOD01)传递给非连续的切片阵列Y0。我几乎可以克服故障(NPMAX增长一个数量级)的唯一方法是来回交换尺寸,但这会导致执行速度减慢大约2倍。 我敢打赌永久交换X / Y / Y0尺寸会解决,但我不想放松像SUB01这样的嵌套循环的效率(第一维比其他尺寸大得多)。
实际上,MOD02会创建一个临时数组。明确地这样做并不能解决问题。
强制分配堆或堆栈无法解决。
那里有任何暗示吗? 感谢您的阅读
更新: 在每次调用时初始化(从sub exc_mpi中的if语句调用exc_init())确实解决了,但由于MAIN(未列出)循环很多,因此效率非常低。
UPDATE2(在@Gilles之后): 这种解决方法也不起作用,即使传递一个连续的数组(在本例中为Y1)并且mpi没有创建临时数。
SUBROUTINE MYSUB (Y)
IMPLICIT NONE
REAL, INTENT(INOUT) :: Y(nl:nr, m, l) ! ni<=nl, nr>=ns
REAL, ALLOCATABLE, DIMENSION(:,:,:) :: Y0, Y1
!...
allocate ( Y0(ni-1:ns, 1:m, 1:l) )
allocate ( Y1(ni:ns, 1:m, 1:l) )
DO i = 1, icount
Y1(nl:nr,:,:) = Y(:,:,:)
call exc_mpi ( Y0 ) ! <-- segfault here
call mpi_barrier (mpi_comm_world, ierr)
Y0(nl:nr,:,:) = Y1(nl:nr,:,:)
Y0(ni-1,:,:) = 0.
CALL SUB01
END DO
deallocate (Y1)
deallocate (Y0)
etc ...