我在Fortran遇到了一个奇怪的问题,我认为这是由不必要的数组副本引起的。当我在一个过程中有一个本地数组时代码很快。但是一旦我更改子例程以将 list 作为 intent(out)参数(或任何其他方法!)返回,代码就会减慢50倍!
示意图,代码看起来像这样(完整代码最后附加):
module
type sys
integer :: num_coor
real, allocatable :: pos(:,:) !--> shape will be (2,num_coor)
...
contains
procedure :: look
procedure :: change
...
end type
contains
subroutine look(obj,j)
class(sys), intent(in) :: obj
integer, intent(in) :: j
integer :: n, list(100) !*** problem when intent(out) ***
n = 0
do k = 1, num_coor
if( k close to j) then
n = n + 1; list(n) = k ! usually n<~70
end if
end do
end subroutine
...
end module
主程序如下:
do
select random j
call obj%look(j)
call obj%change(j)
end do
在这种形式下,每个周期大约需要 50纳秒才能完成,但是当我希望访问数组 list 时,它是在类型绑定过程中创建的 >看看,从主要区块(到目前为止我尝试过的任何方式)完成一个周期需要 1500纳秒。
我想知道是否有人遇到类似的问题,并有任何建议。 如果有人有兴趣,我可以发送原始代码。
P.S。我尝试过使用gfortran 6.4.0,7.3.0和ifort 14和-O3等进行编译。
如果您想尝试,完整代码就在这里。 您可以跳过大部分内容,只关注“!&lt; --- here”指定的部分。
module mover
private
public keeper, g_pp2d, pr
integer, parameter :: sp = kind(1.0), &
dp = kind(1.0d0), &
pr = sp
type pp2d
integer :: nop
real(pr), &
allocatable :: pos(:,:)
real(pr) :: lx
real(pr) :: ly
contains
procedure :: load => load
procedure :: fill_random => fill_random
end type pp2d
type keeper
integer :: cap
integer :: noe
integer, &
allocatable :: elm(:)
contains
procedure :: init => init_keeper
procedure :: add => add_to_keeper
procedure :: pop => pop_from_keeper
procedure :: ext => extend_keeper
procedure :: show => print_keeper
end type keeper
interface keeper
procedure construct_keeper
end interface
integer, parameter :: nbbsk(2,8) = &
reshape( (/1,0,0,1,-1,0,0,-1, &
1,-1,1,1,-1,1,-1,-1/), &
(/2,8/) )
integer, parameter :: med1(-1:1,-1:1) = reshape( &
[8,4,5,3,0,1,7,2,6], [3,3])
real(pr) :: med2(2,-1:1,-1:1)
type, &
extends(keeper) :: basket
integer :: su(0:8)
integer :: cc(2)
real(pr) :: llim(2)
real(pr) :: ulim(2)
end type basket
type, &
extends(pp2d) :: g_pp2d
integer :: nox
integer :: noy
real(pr) :: wx
real(pr) :: wy
integer :: nob
type(basket), &
allocatable :: bsk(:)
integer, &
allocatable :: bop(:)
integer, &
allocatable :: kop(:)
real(pr) :: ll(2), &
ww(2)
integer :: noxy(2)
contains
procedure :: ticks => make_ticks
procedure :: homogen => homogeneous_rep
procedure :: absolute => absolute_rep
procedure :: move_tiny => move_tiny_delta
procedure :: cube1 => selector_cube_1 !<------- here
procedure :: cube2 => selector_cube_2 !<------- here
end type g_pp2d
contains
subroutine init_keeper(kpr,cap)
implicit none
class(keeper), intent(inout) :: kpr
integer, intent(in) :: cap
kpr%cap = cap
kpr%noe = 0
allocate(kpr%elm(cap))
end subroutine init_keeper
function construct_keeper(cap)
implicit none
integer, intent(in) :: cap
type(keeper) :: &
construct_keeper
call construct_keeper%init(cap)
end function construct_keeper
subroutine add_to_keeper(kpr,k)
implicit none
class(keeper), intent(inout) :: kpr
integer, intent(in) :: k
integer :: kk
kk = kpr%noe + 1
if( kk>kpr%cap ) then
call kpr%ext()
end if
kpr%noe = kk
kpr%elm(kk) = k
end subroutine add_to_keeper
subroutine extend_keeper(kpr)
implicit none
class(keeper), intent(inout) :: kpr
integer, allocatable :: buf(:)
allocate( buf, source=[kpr%elm,0] )
deallocate(kpr%elm)
call move_alloc(buf,kpr%elm)
kpr%cap = kpr%cap + 1
end subroutine extend_keeper
subroutine pop_from_keeper(kpr,k,l)
implicit none
class(keeper), intent(inout) :: kpr
integer, intent(in) :: k
integer, intent(out) :: l
if( k<kpr%noe ) then
l = kpr%elm(kpr%noe)
kpr%elm(k) = l
else
l = kpr%elm(k)
end if
kpr%noe = kpr%noe - 1
end subroutine pop_from_keeper
subroutine print_keeper(kpr)
implicit none
class(keeper), intent(in) :: kpr
write(*,'(3x,i6,"/",i6,12x,100i7)') &
kpr%noe, kpr%cap, kpr%elm(1:kpr%noe)
end subroutine print_keeper
subroutine fill_random(pos, nop, lx, ly)
implicit none
class(pp2d), intent(inout) :: pos
integer, intent(in) :: nop
real(pr), intent(in) :: lx, ly
pos%nop = nop
pos%lx = lx
pos%ly = ly
allocate(pos%pos(2,0:nop-1))
call random_number(pos%pos)
pos%pos(1,:) = pos%pos(1,:)*lx
pos%pos(2,:) = pos%pos(2,:)*ly
end subroutine fill_random
subroutine load(pos, fname)
implicit none
class(pp2d), intent(inout) :: pos
character(len=*), intent(in) :: fname
integer :: u, i
open( newunit=u, file=fname )
read(u,*)
read(u,*)
read(u,*)
read(u,*) pos%nop
allocate(pos%pos(2,0:pos%nop-1))
read(u,*) pos%lx, pos%ly
do i = 0, pos%nop-1
read(u,*) pos%pos(:,i)
end do
close(u)
end subroutine load
subroutine make_ticks(pos, wth)
implicit none
class(g_pp2d), intent(inout) :: pos
real(pr), intent(in) :: wth
integer :: cx, cy, bl, at, &
i, j, k, boc
pos%nox = floor(pos%lx/wth)
pos%noy = floor(pos%ly/wth)
pos%ll = [pos%lx,pos%ly]
pos%noxy = [pos%nox-1,pos%noy-1]
pos%wx = pos%lx/pos%nox
pos%wy = pos%ly/pos%noy
pos%ww = [pos%wx,pos%wy]
pos%nob = pos%nox * pos%noy
if( .not. allocated(pos%bsk) ) then
allocate(pos%bsk(0:pos%nob-1))
allocate(pos%bop(0:pos%nop-1))
allocate(pos%kop(0:pos%nop-1))
else
deallocate(pos%bsk)
allocate(pos%bsk(0:pos%nob-1))
end if
boc = pos%nop/pos%nob + 1
do cy = 0, pos%noy-1
do cx = 0, pos%nox-1
bl = cy*pos%nox + cx
call pos%bsk(bl)%init(boc)
pos%bsk(bl)%cc = (/cx,cy/)
pos%bsk(bl)%llim = (/ cx *pos%wx, cy *pos%wy/)
pos%bsk(bl)%ulim = (/(cx+1)*pos%wx,(cy+1)*pos%wy/)
pos%bsk(bl)%su(0) = bl
do k = 1, 8
i = mod(cx+nbbsk(1,k),pos%nox)
j = mod(cy+nbbsk(2,k),pos%noy)
if(i<0) i = i + pos%nox
if(j<0) j = j + pos%noy
pos%bsk(bl)%su(k) = j*pos%nox + i
end do
end do
end do
do at = 0, pos%nop-1
cx = floor(pos%pos(1,at)/pos%wx)
cy = floor(pos%pos(2,at)/pos%wy)
bl = cy*pos%nox + cx
call pos%bsk(bl)%add(at)
pos%bop(at) = bl
pos%kop(at) = pos%bsk(bl)%noe
end do
do i= -1, 1
do j= -1, 1
med2(:,i,j) = [i,j]*pos%ww
end do
end do
end subroutine make_ticks
subroutine homogeneous_rep(pos)
implicit none
class(g_pp2d), intent(inout) :: pos
integer :: at, bl
do at = 0, pos%nop-1
bl = pos%bop(at)
pos%pos(:,at) = pos%pos(:,at) - pos%bsk(bl)%llim
end do
end subroutine homogeneous_rep
subroutine absolute_rep(pos)
implicit none
class(g_pp2d), intent(inout) :: pos
integer :: at, bl
do at = 0, pos%nop-1
bl = pos%bop(at)
pos%pos(:,at) = pos%pos(:,at) + pos%bsk(bl)%llim
end do
end subroutine absolute_rep
subroutine move_tiny_delta(pos, at, dir, delta)
implicit none
class(g_pp2d), intent(inout) :: pos
integer, intent(in) :: at, dir
real(pr), intent(in) :: delta
integer :: bl, cc(2), &
newbl, side_eff
bl = pos%bop(at)
pos%pos(dir,at) = pos%pos(dir,at) + delta
if( delta >= 0.0_pr ) then
if( pos%pos(dir,at)>=pos%ww(dir) ) then
pos%pos(dir,at) = pos%pos(dir,at) - pos%ww(dir)
newbl = pos%bsk(bl)%su(dir)
call pos%bsk(bl)%pop( pos%kop(at), side_eff )
pos%kop( side_eff ) = pos%kop(at)
call pos%bsk(newbl)%add( at )
pos%bop( at ) = newbl
pos%kop( at ) = pos%bsk(newbl)%noe
end if
else
if( pos%pos(dir,at)<0.0_pr ) then
pos%pos(dir,at) = pos%pos(dir,at) + pos%ww(dir)
newbl = pos%bsk(bl)%su(dir+2)
call pos%bsk(bl)%pop( pos%kop(at), side_eff )
pos%kop( side_eff ) = pos%kop(at)
call pos%bsk(newbl)%add( at )
pos%bop( at ) = newbl
pos%kop( at ) = pos%bsk(newbl)%noe
end if
end if
end subroutine move_tiny_delta
subroutine selector_cube_1(pos,at,wth) !<------------- here
implicit none
class(g_pp2d), intent(in) :: pos
integer, intent(in) :: at
real(pr), intent(in) :: wth
integer :: i, j, k, &
l, m, bl, bbl
real(pr) :: dr(2), c1(2), &
c2(2), delta
integer :: n, x(100) !<------------- here
n = 0
c1 = -[wth,wth]
c2 = +[wth,wth]
bl = pos%bop(at)
do i= -1, 1
do j= -1, 1
k = med1(i,j)
bbl = pos%bsk(bl)%su(k)
do l= 1, pos%bsk(bbl)%noe
m = pos%bsk(bbl)%elm(l)
dr = pos%pos(:,m) - pos%pos(:,at) + med2(:,i,j)
if( all(dr>c1) .and. all(dr<c2) ) then
delta = dr(1)*dr(1) + dr(2)*dr(2)
n = n+1
x(n) = m !<------------------ here
end if
end do
end do
end do
end subroutine selector_cube_1
subroutine selector_cube_2(pos,at,wth,x) !<----------- here
implicit none
class(g_pp2d), intent(in) :: pos
integer, intent(in) :: at
real(pr), intent(in) :: wth
integer :: i, j, k, &
l, m, bl, bbl
real(pr) :: dr(2), c1(2), &
c2(2), delta
integer :: n
integer, intent(out) :: x(100) !<---------------- here
n = 0
c1 = -[wth,wth]
c2 = +[wth,wth]
bl = pos%bop(at)
do i= -1, 1
do j= -1, 1
k = med1(i,j)
bbl = pos%bsk(bl)%su(k)
do l= 1, pos%bsk(bbl)%noe
m = pos%bsk(bbl)%elm(l)
dr = pos%pos(:,m) - pos%pos(:,at) + med2(:,i,j)
if( all(dr>c1) .and. all(dr<c2) ) then
delta = dr(1)*dr(1) + dr(2)*dr(2)
n = n+1
x(n) = m
end if
end do
end do
end do
end subroutine selector_cube_2
end module mover
program main
use mover
implicit none
type(g_pp2d) :: pos
real(pr) :: wth = 2.5_pr, delta, dmax = 0.1_pr, &
rand_pr
integer :: dir, step, steps = 10**5, &
at, i
integer(8) :: start_time, end_time, count_rate
integer :: x(100)
! initiate
call pos%fill_random(1000,20.0,20.0)
call pos%ticks(wth)
call pos%homogen()
! run
call system_clock(start_time,count_rate)
do step = 1, steps
call random_number(rand_pr)
at = floor(rand_pr*pos%nop)
call random_number(rand_pr)
if(rand_pr<0.5_pr) then
dir = 1
else
dir = 2
end if
call random_number(rand_pr)
delta = (2*rand_pr-1.0_pr) * dmax
! *********************************************
! if changed to cube2, the code drastically slows down
!-- --------------------------------------------------------------
!call pos%cube1(at,wth) !<---------------- here
call pos%cube2(at,wth, x ) !<---------------- here
!-- -------------------------------------------------------------
call pos%move_tiny(at,dir,delta)
end do
call system_clock(end_time)
call pos%absolute()
! report
write(*,'(3x,a,i7,a)') "Performance (count rate = ", count_rate, "):"
write(*,'(14x,f12.6,6x,a)') &
(real(end_time-start_time)/count_rate)*10**9/steps, &
"Nano Sec (per cycle)"
end program main