I am puzzled by the read/write performance (speed) of plain Fortran I/O compared with MPI-IO for small versus large files.
I wrote the following simple dummy program to test this (it just writes dummy values to file):
PROGRAM test
!
#if defined (__MPI) && defined (__MPI_MODULE)
! The MPI module must be USEd before IMPLICIT NONE
USE mpi
#endif
!
IMPLICIT NONE
!
#if defined (__MPI) && !defined (__MPI_MODULE)
!
! Include file for MPI
!
INCLUDE 'mpif.h'
#endif
#if !defined (__MPI)
! dummy world and null communicator
INTEGER, PARAMETER :: MPI_COMM_WORLD = 0
INTEGER, PARAMETER :: MPI_COMM_NULL = -1
INTEGER, PARAMETER :: MPI_COMM_SELF = -2
! dummy offset kind so the declarations below still compile
INTEGER, PARAMETER :: MPI_OFFSET_KIND = 8
#endif
INTEGER (kind=MPI_OFFSET_KIND) :: pos, pos2
INTEGER, PARAMETER :: DP = 8
REAL(kind=DP), ALLOCATABLE, DIMENSION(:) :: trans_prob, array_cpu
INTEGER :: ierr, i, error, my_pool_id, world_comm
REAL :: start, finish
INTEGER :: iunepmat, npool, arr_size, loop, j
! count argument for MPI_FILE_WRITE must be a default INTEGER
INTEGER :: lsize
iunepmat = 10000
arr_size = 102400
loop = 500
! Initialize MPI
CALL MPI_INIT(ierr)
call MPI_COMM_DUP(MPI_COMM_WORLD, world_comm, ierr)
call MPI_COMM_RANK(world_comm,my_pool_id,error)
ALLOCATE(trans_prob(arr_size))
trans_prob(:) = 1.5d0
!Write using Fortran
CALL MPI_BARRIER(world_comm,error)
!
CALL cpu_time(start)
!
DO i=1, loop
! Sequential unformatted records are framed with 4-byte record-length markers.
! With no FILE= specifier each rank writes to fort.10, fort.11, ... (compiler-dependent naming).
OPEN(unit=10+my_pool_id, form='unformatted', position='append', action='write')
WRITE(10+my_pool_id) trans_prob(:)
CLOSE(unit=10+my_pool_id)
ENDDO
CALL MPI_COMM_SIZE(world_comm, npool, error)
! Master collects the per-rank files and writes the merged file
IF (my_pool_id==0) THEN
ALLOCATE (array_cpu( arr_size * loop ))
array_cpu(:) = 0.0d0
OPEN(unit=100, file='merged.dat', form='unformatted', status='replace', action='write')
DO i=0, npool - 1
! Each per-rank file holds 'loop' sequential records, each framed by
! 4-byte length markers, so it is read back record by record (a single
! direct-access read of the whole file would pick up the markers as data).
OPEN(unit=10+i, form='unformatted', status='old', action='read')
DO j=1, loop
READ(10+i) array_cpu( (j-1)*arr_size + 1 : j*arr_size )
ENDDO
CLOSE(unit=10+i)
WRITE(unit=100) array_cpu(:)
ENDDO
CLOSE(unit=100)
DEALLOCATE (array_cpu)
ENDIF
call cpu_time(finish)
!Print time
CALL MPI_BARRIER(world_comm,error)
IF (my_pool_id==0) print*, ' Fortran time', finish-start
!Write using MPI
CALL MPI_BARRIER(world_comm,error)
!
CALL cpu_time(start)
!
lsize = arr_size ! element count per write (default INTEGER, as MPI expects)
pos = 0
pos2 = 0
CALL MPI_FILE_OPEN(world_comm, 'MPI.dat',MPI_MODE_WRONLY + MPI_MODE_CREATE,MPI_INFO_NULL,iunepmat,ierr)
DO i=1, loop
! iteration i of rank p owns block (i-1)*npool + p of the file
pos = pos2 + INT( arr_size, kind = MPI_OFFSET_KIND ) * INT( my_pool_id, kind = MPI_OFFSET_KIND ) * 8_MPI_OFFSET_KIND
CALL MPI_FILE_SEEK(iunepmat, pos, MPI_SEEK_SET, ierr)
CALL MPI_FILE_WRITE(iunepmat, trans_prob, lsize, MPI_DOUBLE_PRECISION, MPI_STATUS_IGNORE, ierr)
! advance past the full set of npool blocks written in this iteration
! (advancing by npool-1 blocks would make consecutive iterations overlap)
pos2 = pos2 + INT( arr_size, kind = MPI_OFFSET_KIND ) * INT( npool, kind = MPI_OFFSET_KIND ) * 8_MPI_OFFSET_KIND
ENDDO
!
CALL MPI_FILE_CLOSE(iunepmat,ierr)
CALL cpu_time(finish)
CALL MPI_BARRIER(world_comm,error)
IF (my_pool_id==0) print*, ' MPI time', finish-start
DEALLOCATE (trans_prob)
CALL MPI_FINALIZE(ierr)
END PROGRAM
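A side note on the timing itself: cpu_time reports processor time, not elapsed wall-clock time, so it may under-report time spent blocked in I/O. A minimal wall-clock pattern using MPI_WTIME (a sketch reusing world_comm, my_pool_id and ierr from the program above) would be:

DOUBLE PRECISION :: t0, t1
!
CALL MPI_BARRIER(world_comm, ierr)
t0 = MPI_WTIME() ! wall-clock start
! ... the I/O section being timed goes here ...
CALL MPI_BARRIER(world_comm, ierr)
t1 = MPI_WTIME() ! wall-clock end
IF (my_pool_id == 0) print*, ' elapsed wall time (s):', t1 - t0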
It is compiled as follows:
mpif90 -O3 -x f95-cpp-input -D__FFTW -D__MPI -D__SCALAPACK test_mpi2.f90 -o a.x
and then run in parallel on 4 cores:
mpirun -np 4 ./a.x
I get the following results:

loop    array size    file size    Fortran time (s)    MPI time (s)
1       10,240,000    313 MB       0.237030014         0.164155006
10       1,024,000    313 MB       0.242821991         0.172048002
100        102,400    313 MB       0.235879987         9.78289992E-02
50       1,024,000    1.6 GB       1.60272002          3.40623116
500        102,400    1.6 GB       1.44547606          3.38340592
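(As a sanity check on the file sizes: the intended data volume per run is npool × loop × array size × 8 bytes, e.g. 4 × 1 × 10,240,000 × 8 B ≈ 313 MiB for the first run and 4 × 500 × 102,400 × 8 B ≈ 1.64 GB for the last one.)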
As you can see, the MPI-IO performance degrades significantly for the larger files. Can the MPI-IO performance for large files be improved?
Is this expected behavior? (A collective-write variant I have been wondering about is sketched below.)
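For reference, this is the collective-write variant I mean, as far as I understand the MPI-IO interface (untested; the file name MPI_coll.dat and the interleaved block layout are my own choices):

PROGRAM test_collective
USE mpi
IMPLICIT NONE
INTEGER, PARAMETER :: DP = 8
INTEGER, PARAMETER :: arr_size = 102400, loop = 500
REAL(kind=DP), ALLOCATABLE :: trans_prob(:)
INTEGER :: ierr, my_pool_id, npool, fh, i
INTEGER (kind=MPI_OFFSET_KIND) :: offset
!
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_pool_id, ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, npool, ierr)
ALLOCATE(trans_prob(arr_size))
trans_prob(:) = 1.5d0
CALL MPI_FILE_OPEN(MPI_COMM_WORLD, 'MPI_coll.dat', MPI_MODE_WRONLY + MPI_MODE_CREATE, MPI_INFO_NULL, fh, ierr)
DO i = 1, loop
! iteration i of rank p owns block (i-1)*npool + p of the file
offset = ( INT(i-1, MPI_OFFSET_KIND) * npool + my_pool_id ) * INT(arr_size, MPI_OFFSET_KIND) * 8_MPI_OFFSET_KIND
! collective write: all ranks participate, so the library can aggregate
! the interleaved blocks into fewer, larger requests (collective buffering)
CALL MPI_FILE_WRITE_AT_ALL(fh, offset, trans_prob, arr_size, MPI_DOUBLE_PRECISION, MPI_STATUS_IGNORE, ierr)
ENDDO
CALL MPI_FILE_CLOSE(fh, ierr)
DEALLOCATE(trans_prob)
CALL MPI_FINALIZE(ierr)
END PROGRAM test_collective

Would MPI_FILE_WRITE_AT_ALL (or striping hints passed through an MPI_Info object) be the right direction here?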
Many thanks, Sam