为什么我的双精度Fortran幂运算程序比单精度更快?

时间:2016-08-22 15:58:09

标签: performance gcc fortran

我发现幂运算 a**b 比 dble(a)**dble(b) 快,其中 a 和 b 是单精度浮点数。为什么?我的编译器信息如下:

COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man
 --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla
 --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release
 --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions
 --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto
 --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install
 --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install
 --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-4) (GCC)

这是我的测试代码:

!> Benchmark kernels that raise every element of the shared input array `a`
!> to the constant power `b`, once in default (single) precision and once
!> after promoting both operands to double precision.
module func
    implicit none

    real, parameter :: b = 1.33   ! exponent used by both kernels
    real :: a(1000000)            ! input data; the caller fills it before timing

contains

!> Power calculation carried out entirely in default (single) precision.
!>
!> c : receives a(i)**b for i = 1 .. min(size(a), size(c)).  Elements of `c`
!>     beyond size(a), if any, are left untouched.
subroutine power(c)
    real, intent(inout) :: c(:)
    integer :: i

    ! Bound the loop by both arrays so a shorter `c` is never overrun.
    do i = 1, min(size(a), size(c))
        c(i) = a(i)**b
    end do
end subroutine power

!> Power calculation with both operands converted to double precision; the
!> double-precision result is converted back to single on assignment to `c`.
!>
!> c : receives dble(a(i))**dble(b) for i = 1 .. min(size(a), size(c)).
!>     Elements of `c` beyond size(a), if any, are left untouched.
subroutine hp_power(c)
    real, intent(inout) :: c(:)
    integer :: i

    ! Bound the loop by both arrays so a shorter `c` is never overrun.
    do i = 1, min(size(a), size(c))
        c(i) = dble(a(i))**dble(b)
    end do
end subroutine hp_power

end module func

!> Time one call of the single-precision kernel and one call of the
!> double-precision kernel with cpu_time, printing the elapsed CPU seconds
!> and one sample element of each result.
program compare
    use func, only: a, power, hp_power
    implicit none
    real :: c1(1000000), c2(1000000)
    real :: start_T, end_T

    ! Initialise the input array: every element gets the same base value.
    a = 3.

    call cpu_time(start_T)
    call power(c1)
    call cpu_time(end_T)
    write(*,*) "The power running time is :,", end_T-start_T, 's', 'c(5):',c1(5)

    call cpu_time(start_T)
    call hp_power(c2)
    call cpu_time(end_T)
    write(*,*) "The hp_power running time is :,", end_T-start_T, 's', 'c(5):',c2(5)

end program compare

1 个答案:

答案 0 :(得分:3)

请看下面对你的代码所做的改写(使用Intel编译器编译)。我添加了第三个例程,它把结果存储在double类型的数组中:

!> Benchmark kernels that raise every element of the shared input array `a`
!> to the constant power `b`.  Three variants are provided so that the cost
!> of the arithmetic precision and of the result type can be compared.
module func
    implicit none

    ! Kind of the values returned by dble(); private so it does not leak
    ! into programs that `use func`.
    integer, parameter, private :: dp = kind(1.0d0)

    real, parameter :: b = 1.33   ! exponent used by all kernels
    real :: a(1000000)            ! input data; the caller fills it before timing

contains

!> Power calculation carried out entirely in default (single) precision.
!>
!> c : receives a(i)**b for i = 1 .. min(size(a), size(c)).  Elements of `c`
!>     beyond size(a), if any, are left untouched.
subroutine power(c)
    real, intent(inout) :: c(:)
    integer :: i

    ! Bound the loop by both arrays so a shorter `c` is never overrun.
    do i = 1, min(size(a), size(c))
        c(i) = a(i)**b
    end do
end subroutine power

!> Power calculation with both operands converted to double precision; the
!> double-precision result is converted back to single on assignment to `c`.
!>
!> c : receives dble(a(i))**dble(b) for i = 1 .. min(size(a), size(c)).
!>     Elements of `c` beyond size(a), if any, are left untouched.
subroutine hp_power(c)
    real, intent(inout) :: c(:)
    integer :: i

    do i = 1, min(size(a), size(c))
        c(i) = dble(a(i))**dble(b)
    end do
end subroutine hp_power

!> Same double-precision power calculation as hp_power, but the result is
!> stored in a double-precision array, so no conversion back to single
!> takes place on assignment.
!>
!> c : receives dble(a(i))**dble(b) for i = 1 .. min(size(a), size(c)).
!>     Elements of `c` beyond size(a), if any, are left untouched.
subroutine hp2_power(c)
    real(dp), intent(inout) :: c(:)
    integer :: i

    do i = 1, min(size(a), size(c))
        c(i) = dble(a(i))**dble(b)
    end do
end subroutine hp2_power

end module func

!> Benchmark driver: calls each of the three power kernels n_reps times,
!> measures every call with system_clock, and prints the mean wall-clock
!> time per call in seconds (power, hp_power, hp2_power, in that order).
program compare
    use func, only: a, power, hp_power, hp2_power
    implicit none

    integer, parameter :: n_reps = 100   ! repetitions averaged per kernel

    integer :: count, count_2, count_rate
    integer :: x
    real :: c1_, c1_acc, c2_, c2_acc, c3_, c3_acc
    real :: c1(1000000), c2(1000000)
    real(kind(1.0d0)) :: c3(1000000)

    ! Every accumulator must start from zero, including the one for
    ! hp2_power; otherwise its reported mean is undefined.
    c1_acc = 0.0
    c2_acc = 0.0
    c3_acc = 0.0

    ! Initialise the input array: every element gets the same base value.
    a = 3.

    ! The tick rate does not change between calls, so query it once.
    call system_clock(count_rate=count_rate)

    do x = 1, n_reps
        call system_clock(count)
        call power(c1)
        call system_clock(count_2)
        c1_acc = c1_acc + real(count_2 - count)/count_rate

        call system_clock(count)
        call hp_power(c2)
        call system_clock(count_2)
        c2_acc = c2_acc + real(count_2 - count)/count_rate

        call system_clock(count)
        call hp2_power(c3)
        call system_clock(count_2)
        c3_acc = c3_acc + real(count_2 - count)/count_rate
    end do

    ! Mean time per call for each kernel.
    c1_ = c1_acc / real(n_reps)
    c2_ = c2_acc / real(n_reps)
    c3_ = c3_acc / real(n_reps)

    write(*,*) c1_
    write(*,*) c2_
    write(*,*) c3_
end program compare

在Debug(未优化)模式下,我得到如下结果:

power -> 2.0639971E-02

hp_power -> 2.7769983E-02

hp2_power -> 2.7449980E-02

在Release(开启优化)模式下,结果如下:

power -> 6.7950045E-03

hp_power -> 6.8100006E-03

hp2_power -> 1.6954981E-02

实际情况是:开启优化后,如果结果要存储在real类型的数组中,编译器会省略到双精度的转换。在hp2_power中,结果存储在real*8(double)类型的数组中,因此无法应用这一优化(这就是该函数耗时不同的原因)。

我得到的结果始终一致。在Debug模式下,power总是比hp_power快,而hp_power与hp2_power几乎相同;在Release模式下,hp2_power总是较慢,而power与hp_power非常接近。

由于其他因素会干扰计时,你需要重复很多次才能得到可靠的结论。查看各次重复的单独计时时,时间会有少许波动,power和hp_power有时会完全相同。