我正在尝试并行处理Fortran代码,该代码一时将大量数字写入格式化的输出中。一些简单的分析表明,大多数CPU时间都花费在格式转换上,因此我想到了与字符缓冲区并行进行格式化,然后再将未格式化的缓冲区写入文件中。
我的概念证明如下:
program parawrite
implicit none
integer (kind = 4) :: i, j, tstart, tstop, rate
integer (kind = 4), parameter :: bufsize = 100000, n = 10000000, llen = 22
character (kind=1, len=:), allocatable :: buf
real (kind=8), dimension(n) :: a
! some input
do i = 1, n
a(i) = dble(i) * dble(i)
enddo
! formated writes for reference
open(unit=10, file="out1.txt", form="formatted")
call system_clock(tstart, rate);
do i = 1, n
write(10,"(E21.15)") a(i)
end do
call system_clock(tstop, rate);
print *, 'Formated write: ', dble(tstop - tstart) / dble(rate), 's'
close(10)
! parallel stuff
open(unit=10, file="out2.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
!$omp parallel private(buf, j)
allocate(character(bufsize * llen) :: buf)
j = 0;
!$omp do ordered schedule(dynamic,bufsize)
do i = 1, n
write (buf(j*llen+1:(j+1)*llen),"(E21.15,A1)") a(i), char(10)
j = j + 1
if (mod(i, bufsize) == 0) then
!$omp ordered
write (10) buf
!$omp end ordered
j = 0
end if
end do
deallocate(buf)
!$omp end parallel
close(10)
call system_clock(tstop, rate);
print *, 'Parallel write: ', dble(tstop - tstart) / dble(rate), 's'
end program parawrite
但是,当我运行它时,不仅在单线程时并行版本会慢很多,而且扩展性也不会太大...
$ gfortran -O2 -fopenmp writetest.f90
$ OMP_NUM_THREADS=1 ./a.out
Formated write: 11.330000000000000 s
Parallel write: 15.625999999999999 s
$ OMP_NUM_THREADS=6 ./a.out
Formated write: 11.331000000000000 s
Parallel write: 6.1799999999999997 s
我的第一个问题是如何使单线程的速度相同?将缓冲区写入文件所花费的时间可以忽略不计,那么为什么写入缓冲区要比直接写入文件时慢呢?
我的第二个问题是为什么缩放如此糟糕?我有一个使用sprintf和fwrite的等效C代码,可以得到几乎完美的线性缩放(如果需要,可以发布代码),但是使用Fortran时,我只能将6个线程的运行时间减少到40%左右(使用CI可以减少在相同的线程数下达到18%)。它仍然比串行版本要快,但是我希望可以对此进行改进。
答案 0 :(得分:1)
从一些实验来看,如果一次将一个数组元素转换为一个内部文件,则内部文件的运行速度似乎很慢。外部文件也是如此,但是内部文件的减速程度似乎要大得多(出于某种原因...)。因此,我修改了代码,以便一次转换一组数组元素,然后通过流输出将其写入外部文件。下面,比较了四种模式:
其中,Parallel(2)+ sprintf(在代码中标记为*2
)是最快的,而Parallel(2)+每个元素的写入(标记为*1
)是最慢的(时序在表中显示为Parallel (*)
,由于某种原因无法与OpenMP一起扩展)。我猜sprintf最快,可能是因为内部检查和开销等最少(只是一个猜测!)
结果(修改后的代码请参见底部)
$ gcc -O3 -c conv.c && gfortran -O3 -fopenmp test.f90 conv.o
# Machine: Core i7-8550U (1.8GHz), 4-core/8-thread, Ubuntu18.04 (GCC7.3.0)
# Note: The amount of data has been reduced to 1/5 of the
# original code, n = bufsize * 20, but the relative
# timing results remain the same even for larger data.
$ OMP_NUM_THREADS=1 ./a.out
Sequential (1): 2.0080000000000000 s
Sequential (2): 1.6510000000000000 s
Parallel (1): 1.6960000000000000 s
Parallel (2): 1.2640000000000000 s
Parallel (*): 3.1480000000000001 s
$ OMP_NUM_THREADS=2 ./a.out
Sequential (1): 1.9990000000000001 s
Sequential (2): 1.6479999999999999 s
Parallel (1): 0.98599999999999999 s
Parallel (2): 0.72999999999999998 s
Parallel (*): 1.8600000000000001 s
$ OMP_NUM_THREADS=4 ./a.out
Sequential (1): 2.0289999999999999 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.61199999999999999 s
Parallel (2): 0.49399999999999999 s
Parallel (*): 1.4470000000000001 s
$ OMP_NUM_THREADS=8 ./a.out
Sequential (1): 2.0059999999999998 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.56200000000000006 s
Parallel (2): 0.41299999999999998 s
Parallel (*): 1.7689999999999999 s
main.f90:
program main
implicit none
integer :: i, j, k, tstart, tstop, rate, idiv, ind1, ind2
integer, parameter :: bufsize = 100000, n = bufsize * 20, llen = 22, ndiv = 8
character(len=:), allocatable :: buf(:), words(:)
character(llen + 1) :: word
real(8), allocatable :: a(:)
allocate( a( n ) )
! Some input
do i = 1, n
a(i) = dble(i)**2
enddo
!.........................................................
! Formatted writes (1).
open(unit=10, file="dat_seq1.txt", form="formatted")
call system_clock(tstart, rate);
do i = 1, n
write(10,"(ES21.15)") a(i)
end do
call system_clock(tstop, rate);
print *, 'Sequential (1):', dble(tstop - tstart) / dble(rate), 's'
close(10)
!.........................................................
! Formatted writes (2).
open(unit=10, file="dat_seq2.txt", form="formatted")
call system_clock(tstart, rate);
write( 10, "(ES21.15)" ) a
! write( 10, "(ES21.15)" ) ( a( k ), k = 1, n )
call system_clock(tstop, rate);
print *, 'Sequential (2):', dble(tstop - tstart) / dble(rate), 's'
close(10)
!.........................................................
! Parallel writes (1): make a formatted string for many elements at once
allocate( character( llen * bufsize / ndiv ) :: buf( ndiv ) )
open(unit=10, file="dat_par1.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
do i = 1, n, bufsize
!$omp parallel do private( idiv, ind1, ind2, k ) shared( i, buf, a )
do idiv = 1, ndiv
ind1 = i + (idiv - 1) * bufsize / ndiv
ind2 = ind1 + bufsize / ndiv - 1
write( buf( idiv ),"(*(ES21.15, A1))") &
( a( k ), char(10), k = ind1, ind2 )
enddo
!$omp end parallel do
write(10) buf
end do
call system_clock(tstop, rate);
print *, 'Parallel (1):', dble(tstop - tstart) / dble(rate), 's'
deallocate(buf)
close(10)
!.........................................................
! Parallel writes (2): sprintf vs write for each element
allocate( character( llen ) :: words( n ) )
open(unit=10, file="dat_par2.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
!$omp parallel do private( i, word ) shared( a, words )
do i = 1, n
! write( word, "(ES21.15, A1)" ) a( i ), char(10) !! slow (*1)
call conv( word, a( i ) ) !! sprintf (*2)
words( i ) = word( 1 : llen )
enddo
!$omp end parallel do
write( 10 ) words
call system_clock(tstop, rate);
print *, 'Parallel (2):', dble(tstop - tstart) / dble(rate), 's'
close(10)
end program
转化:
#include <stdio.h>
void conv_( char *buf, double *val )
{
sprintf( buf, "%21.15E\n", *val );
}