Performance of file writing/reading with Fortran I/O vs. MPI-IO

Date: 2019-05-02 11:32:49

Tags: optimization fortran mpi writing

I am confused by the read/write performance (speed) of plain Fortran I/O compared with MPI-IO, for both small and large files.

I wrote the following simple dummy program to test this (it just writes dummy values to files):

PROGRAM test
!
#if defined (__MPI)
#if defined (__MPI_MODULE)
        !
        !     The module form of the MPI interface must precede IMPLICIT NONE
        !
        USE mpi
#endif
#endif
!
IMPLICIT NONE
!
#if defined (__MPI)
#if !defined (__MPI_MODULE)
        !
        !     Include file for MPI (an INCLUDE line goes with the declarations)
        !
        INCLUDE 'mpif.h'
#endif
#else
        ! dummy world and null communicators for a serial build
        INTEGER, PARAMETER :: MPI_COMM_WORLD =  0
        INTEGER, PARAMETER :: MPI_COMM_NULL  = -1
        INTEGER, PARAMETER :: MPI_COMM_SELF  = -2
#endif

INTEGER(kind=MPI_OFFSET_KIND) :: pos, pos2
INTEGER, PARAMETER :: DP = 8
REAL(kind=DP), ALLOCATABLE, DIMENSION(:) :: trans_prob, array_cpu
INTEGER :: ierr, i, error, my_pool_id, world_comm
REAL    :: start, finish
INTEGER :: iunepmat, npool, arr_size, loop
REAL(kind=DP)   :: dummy
INTEGER(kind=8) :: unf_recl
INTEGER :: direct_io_factor

iunepmat = 10000
arr_size = 102400
loop     = 500

! Initialize MPI
CALL MPI_INIT(ierr)
call MPI_COMM_DUP(MPI_COMM_WORLD, world_comm, ierr)
call MPI_COMM_RANK(world_comm,my_pool_id,error)


ALLOCATE(trans_prob(arr_size))
trans_prob(:) = 1.5d0

!Write using Fortran
CALL MPI_BARRIER(world_comm,error)
!
CALL cpu_time(start)
!
DO i=1, loop
  ! Sequential unformatted I/O also stores record markers (typically 4 bytes)
  ! around each record. With no file= specifier, each rank writes to the
  ! compiler-default file name for its unit (fort.10, fort.11, ... with
  ! gfortran); note that position='append' means re-runs keep growing them.
  OPEN(unit=10+my_pool_id, form='unformatted', position='append', action='write')
  WRITE(10+my_pool_id) trans_prob(:)
  CLOSE(unit=10+my_pool_id)
ENDDO

CALL MPI_COMM_SIZE(world_comm, npool, error)

! Master collects the per-rank files and writes them into one merged file
IF (my_pool_id==0) THEN
  INQUIRE (IOLENGTH=direct_io_factor) dummy
  unf_recl = direct_io_factor * INT(arr_size * loop, kind=KIND(unf_recl))
  ALLOCATE (array_cpu( arr_size * loop ))
  array_cpu(:) = 0.0d0
  ! status='replace' so the benchmark can be re-run ('new' fails once the file exists)
  OPEN(unit=100, file='merged.dat', form='unformatted', status='replace', action='write')
  DO i=0, npool - 1
    ! Read each rank's file back as a single direct-access record; the raw
    ! record markers are read in along with the data, which is harmless for
    ! this dummy benchmark.
    OPEN(unit=10+i, form='unformatted', status='old', access='direct', recl=unf_recl)
    READ(unit=10+i, rec=1) array_cpu(:)
    CLOSE(unit=10+i)
    WRITE(unit=100) array_cpu(:)
  ENDDO
  CLOSE(unit=100)
  DEALLOCATE (array_cpu)
ENDIF

call cpu_time(finish)

!Print time
CALL MPI_BARRIER(world_comm,error)
IF (my_pool_id==0) print*, ' Fortran time', finish-start

!Write using MPI
CALL MPI_BARRIER(world_comm,error)
!
CALL cpu_time(start)
! 
pos  = 0_MPI_OFFSET_KIND
pos2 = 0_MPI_OFFSET_KIND
CALL MPI_FILE_OPEN(world_comm, 'MPI.dat', MPI_MODE_WRONLY + MPI_MODE_CREATE, MPI_INFO_NULL, iunepmat, ierr)
DO i=1, loop
  ! Each rank seeks to its own block within the current round of writes
  pos = pos2 + INT(arr_size, kind=MPI_OFFSET_KIND) * my_pool_id * 8_MPI_OFFSET_KIND
  CALL MPI_FILE_SEEK(iunepmat, pos, MPI_SEEK_SET, ierr)
  ! The count argument is a default INTEGER
  CALL MPI_FILE_WRITE(iunepmat, trans_prob, arr_size, MPI_DOUBLE_PRECISION, MPI_STATUS_IGNORE, ierr)
  ! Advance by one full round of npool blocks; with a stride of npool-1
  ! blocks, rank 0's next write would overwrite rank npool-1's previous block
  pos2 = pos2 + INT(arr_size, kind=MPI_OFFSET_KIND) * npool * 8_MPI_OFFSET_KIND
ENDDO
!
CALL MPI_FILE_CLOSE(iunepmat,ierr)
CALL cpu_time(finish)

CALL MPI_BARRIER(world_comm,error)
IF (my_pool_id==0) print*, ' MPI time', finish-start

DEALLOCATE (trans_prob)

END PROGRAM 

It is compiled as follows:

mpif90  -O3  -x f95-cpp-input -D__FFTW -D__MPI -D__SCALAPACK  test_mpi2.f90 -o a.x

and then run in parallel on 4 cores:

mpirun -np 4 ./a.x

I get the following results:

loop size   array size    file size   Fortran time       MPI time
    1       10,240,000    313 MB      0.237030014 s      0.164155006 s
   10        1,024,000    313 MB      0.242821991 s      0.172048002 s
  100          102,400    313 MB      0.235879987 s      9.78289992E-02 s
   50        1,024,000    1.6 GB      1.60272002 s       3.40623116 s
  500          102,400    1.6 GB      1.44547606 s       3.38340592 s
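(Sanity check on the sizes, if I am counting correctly: each of the 4 ranks writes loop × arr_size doubles, so e.g. 4 × 500 × 102,400 × 8 B = 1,638,400,000 B ≈ 1.6 GB, and 4 × 10,240,000 × 8 B = 327,680,000 B ≈ 313 MiB, consistent with the reported file sizes.)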


As you can see, the MPI-IO performance degrades noticeably for the larger files. Is it possible to improve the MPI performance for large files?

Is this the expected behavior?
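In case it helps to have something concrete to compare against: one variant I have been considering (only an untested sketch; the file name 'MPI2.dat' and the variables fh, filetype and disp are placeholders, not part of the program above) replaces the per-iteration MPI_FILE_SEEK with a strided file view set once, plus the collective MPI_FILE_WRITE_ALL, which should give the MPI-IO layer a chance to aggregate the many small requests:

INTEGER :: fh, filetype
INTEGER(kind=MPI_OFFSET_KIND) :: disp

! One strided filetype per rank: 'loop' blocks of arr_size doubles,
! with a stride of npool*arr_size doubles between block starts
CALL MPI_TYPE_VECTOR(loop, arr_size, npool*arr_size, MPI_DOUBLE_PRECISION, filetype, ierr)
CALL MPI_TYPE_COMMIT(filetype, ierr)
! Each rank's view starts at its own block within the first stride
disp = INT(my_pool_id, MPI_OFFSET_KIND) * INT(arr_size, MPI_OFFSET_KIND) * 8_MPI_OFFSET_KIND
CALL MPI_FILE_OPEN(world_comm, 'MPI2.dat', MPI_MODE_WRONLY + MPI_MODE_CREATE, MPI_INFO_NULL, fh, ierr)
CALL MPI_FILE_SET_VIEW(fh, disp, MPI_DOUBLE_PRECISION, filetype, 'native', MPI_INFO_NULL, ierr)
DO i=1, loop
  ! Collective write: successive writes walk through the visible (strided)
  ! blocks of the view in order, so no explicit seek is needed
  CALL MPI_FILE_WRITE_ALL(fh, trans_prob, arr_size, MPI_DOUBLE_PRECISION, MPI_STATUS_IGNORE, ierr)
ENDDO
CALL MPI_FILE_CLOSE(fh, ierr)
CALL MPI_TYPE_FREE(filetype, ierr)

Whether the collective version actually helps presumably depends on the MPI implementation and the file system, so I would also be interested to know whether this is the right direction.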

Many thanks, Sam

0 Answers

No answers yet.