Question

我用Fortran为快速的xoroshiro128plus伪随机数生成器（pseudo-random number generator）写了一个最小的实现，用来替换内置的（intrinsic）random_number。此实现速度非常快（比random_number快4倍），质量也足以满足我的需要；我不会在加密应用程序中使用它。

我的问题是：我该如何优化此子例程，以便从编译器中榨取最后一点性能，哪怕只提高10％也好。该子例程将用于长时间模拟中的紧密循环中。我感兴趣的是一次生成一个随机数，而不是一次生成大向量或nD数组。

这是一个测试程序，可为您提供有关如何使用我的子例程的上下文：

program test_xoroshiro128plus
   ! Benchmark driver: fills an n-by-n matrix with one drand128 call per
   ! element, then prints the elapsed time and the sample mean.
   use, intrinsic :: iso_fortran_env, only: real64
   implicit none
   integer, parameter :: n = 10000
   real(real64) :: A(n,n)
   integer :: i, j, t0, t1, count_rate, count_max

   call system_clock(t0, count_rate, count_max)
   ! Column-major traversal: the first index varies in the inner loop.
   do j = 1,n
      do i = 1,n
         call drand128(A(i,j))
      end do
   end do
   ! call drand128(A)  ! works also with 2D
   call system_clock(t1)

   ! Clock ticks divided by ticks per second gives seconds.
   print *, "Time :", real(t1-t0)/count_rate
   ! Sanity check: the mean is expected to be near 0.5 if drand128 is
   ! uniform on [0, 1). char(10) inserts a line feed before the samples.
   print *, "Mean :", sum(A)/size(A), char(10), A(1:2,1:3)

 contains

   impure elemental subroutine drand128(r)
      ! xoroshiro128+-style generator: advances two saved 64-bit state words
      ! and returns a double-precision value in [0, 1).
      !
      ! r : receives the new random number. Because the routine is elemental
      !     it may also be called with an array actual argument, in which
      !     case every element receives its own draw.
      use, intrinsic :: iso_fortran_env, only: int64, real64
      real(real64), intent(out) :: r
      ! 2**63: maps the 63-bit non-negative integer below into [0, 1).
      real(real64), parameter :: two63 = 9223372036854775808.0_real64
      ! Generator state. `save` is spelled out: initialisation in the
      ! declaration already implies it, and the state must persist between calls.
      integer(int64), save :: s0 = 113_int64, s1 = 19937_int64

      s1 = ieor(s0, s1)
      ! rotl(s0, 55) xor s1 xor (s1 shifted left by 14)
      s0 = ieor(ieor(ior(ishft(s0, 55), ishft(s0, -9)), s1), ishft(s1, 14))
      ! rotl(s1, 36)
      s1 = ior(ishft(s1, 36), ishft(s1, -28))
      ! NOTE(review): s0 + s1 can exceed the int64 range; the generator
      ! presumably relies on wrap-around, which the standard does not guarantee.
      ! The logical right shift clears the sign bit before conversion to real.
      r = ishft(s0 + s1, -1) / two63
   end subroutine drand128

end program

Answer 1

直到现在，我才意识到您问的是这个特定的PRNG。我自己也在Fortran中使用它：https://bitbucket.org/LadaF/elmm/src/eb5b54b9a8eb6af158a38038f72d07865fe23ee3/src/rng_par_zig.f90?at=master&fileviewer=file-view-default

我在链接中的代码比您的代码慢，因为它调用了几个子例程并旨在变得更加通用。让我们尝试将我使用的代码压缩到一个子例程中。

因此，我们来比较一下您的代码、@SeverinPappadeux的优化版本以及我的优化代码的性能，编译器为Gfortran 4.8.5：

> gfortran -cpp -O3 -mtune=native xoroshiro.f90 

 Time drand128 sub:   1.80900002    
 Time drand128 fun:   1.80900002    
 Time rng_uni:   1.32900000

代码如下。请记住先让CPU预热（升频）：k循环第一次迭代的计时结果不可靠，应当忽略！

program test_xoroshiro128plus
   ! Benchmark of three single-value uniform generators: the drand128
   ! subroutine, the drand128_fun function and rng_uni. Each one fills the
   ! n-by-n matrix A element by element and the elapsed time is printed.
   use, intrinsic :: iso_fortran_env, only: int64, real64
   implicit none
   integer, parameter :: n = 30000
   real(real64) :: A(n,n)
   integer :: i, j, k, t0, t1, count_rate, count_max

   ! State words read and updated by rng_uni through host association.
   integer(int64) :: s1 = int(Z'1DADBEEFBAADD0D0', int64), &
                     s2 = int(Z'5BADD0D0DEADBEEF', int64)

   ! Let the CPU spin up: the whole benchmark is repeated three times and
   ! the timings printed by the first pass should be disregarded.
   do k = 1, 3
      call system_clock(t0, count_rate, count_max)
      do j = 1,n
         do i = 1,n
            call drand128(A(i,j))
         end do
      end do
      ! call drand128(A)  ! works also with 2D
      call system_clock(t1)

      print *, "Time drand128 sub:", real(t1-t0)/count_rate

      call system_clock(t0, count_rate, count_max)
      do j = 1,n
         do i = 1,n
            A(i,j) = drand128_fun()
         end do
      end do
      call system_clock(t1)

      print *, "Time drand128 fun:", real(t1-t0)/count_rate

      call system_clock(t0, count_rate, count_max)
      do j = 1,n
         do i = 1,n
            call rng_uni(A(i,j))
         end do
      end do
      call system_clock(t1)

      print *, "Time rng_uni:", real(t1-t0)/count_rate
   end do

   ! A still holds the values written by rng_uni in the last pass.
   print *, "Mean :", sum(A)/size(A), char(10), A(1:2,1:3)

 contains

   impure elemental subroutine drand128(r)
      ! xoroshiro128+-style generator: advances two saved 64-bit state words
      ! and returns a double-precision value in [0, 1).
      !
      ! r : receives the new random number. Because the routine is elemental
      !     it may also be called with an array actual argument, in which
      !     case every element receives its own draw.
      use, intrinsic :: iso_fortran_env, only: int64, real64
      real(real64), intent(out) :: r
      ! 2**63: maps the 63-bit non-negative integer below into [0, 1).
      real(real64), parameter :: two63 = 9223372036854775808.0_real64
      ! Generator state, private to this routine; `save` is spelled out
      ! because the state must persist between calls.
      integer(int64), save :: s0 = 113_int64, s1 = 19937_int64

      s1 = ieor(s0, s1)
      ! rotl(s0, 55) xor s1 xor (s1 shifted left by 14)
      s0 = ieor(ieor(ior(ishft(s0, 55), ishft(s0, -9)), s1), ishft(s1, 14))
      ! rotl(s1, 36)
      s1 = ior(ishft(s1, 36), ishft(s1, -28))
      ! NOTE(review): s0 + s1 can exceed the int64 range; the generator
      ! presumably relies on wrap-around, which the standard does not guarantee.
      ! The logical right shift clears the sign bit before conversion to real.
      r = ishft(s0 + s1, -1) / two63
   end subroutine drand128

   impure elemental function drand128_fun() result(r)
      ! Function form of the xoroshiro128+-style generator: advances two saved
      ! 64-bit state words and returns a double-precision value in [0, 1).
      !
      ! The function has no arguments, so every reference yields a scalar;
      ! assigning it to a whole array stores one value in all elements.
      use, intrinsic :: iso_fortran_env, only: int64, real64
      real(real64) :: r
      ! 2**(-63), so the final scaling is a multiplication, not a division.
      real(real64), parameter :: c = 1.0_real64/9223372036854775808.0_real64
      ! Generator state, private to this function; `save` is spelled out
      ! because the state must persist between calls.
      integer(int64), save :: s0 = 113_int64, s1 = 19937_int64

      s1 = ieor(s0, s1)
      ! rotl(s0, 55) xor s1 xor (s1 shifted left by 14)
      s0 = ieor(ieor(ior(ishft(s0, 55), ishft(s0, -9)), s1), ishft(s1, 14))
      ! rotl(s1, 36)
      s1 = ior(ishft(s1, 36), ishft(s1, -28))
      ! NOTE(review): s0 + s1 can exceed the int64 range; the generator
      ! presumably relies on wrap-around, which the standard does not guarantee.
      ! The logical right shift clears the sign bit before conversion to real.
      r = ishft(s0 + s1, -1) * c
   end function drand128_fun

  impure elemental subroutine rng_uni(fn_val)
    ! Draws one double-precision value in [0, 1) from the generator whose
    ! state words s1 and s2 live in the host program.
    !
    ! fn_val : receives the new random number; its previous value is not used.
    real(real64), intent(out) :: fn_val
    ! Bit pattern of 1.0_real64: sign 0, biased exponent 0x3FF, zero mantissa.
    integer(int64), parameter :: one_bits = int(Z'3FF0000000000000', int64)
    integer(int64) :: ival

    ! The raw output is the sum of the state words before they are advanced.
    ! NOTE(review): the sum can exceed the int64 range; wrap-around is
    ! presumably intended but is not guaranteed by the standard.
    ival = s1 + s2

    ! Advance the state.
    s2 = ieor(s2, s1)
    s1 = ieor(ieor(rotl(s1, 24), s2), shiftl(s2, 16))
    s2 = rotl(s2, 37)

    ! Put the top 52 bits of the raw output into the mantissa field below the
    ! exponent of 1.0, which gives a value in [1, 2); subtracting 1 maps it
    ! to [0, 1) without an integer-to-real conversion.
    ival = ior(one_bits, shiftr(ival, 12))
    fn_val = transfer(ival, 1.0_real64) - 1.0_real64
  end subroutine rng_uni

  pure function rotl(x, k)
    ! Rotates the bits of the 64-bit integer x to the left by k positions.
    !
    ! x : value to rotate.
    ! k : rotation count; shiftl/shiftr require 0 <= k <= bit_size(x).
    ! Returns x with its high k bits wrapped around into the low k bits.
    integer(int64) :: rotl
    integer(int64), intent(in) :: x
    integer, intent(in) :: k

    ! The bits pushed out on the left by shiftl are exactly the ones shiftr
    ! brings down to the bottom, so the two halves never overlap.
    rotl = ior(shiftl(x, k), shiftr(x, bit_size(x) - k))
  end function rotl

end program

主要区别应该在于采用了一种更快、更好的从整数转换为实数的方式，参见：http://experilous.com/1/blog/post/perfect-fast-random-floating-point-numbers#half-open-range

如果您觉得无聊，可以尝试手动内联rotl()，但在这里我相信编译器。

Answer 2

好的，这是我的尝试。首先，我把它改成了函数——在x64或类似的ABI中，函数通过寄存器返回浮点值，这比通过参数传递要快得多。其次，我用乘法代替了最后的除法，尽管英特尔编译器可能会自动替您完成这一点。

计时，Intel i7 6820，WSL，Ubuntu 18.04：

before -   0.850000024
after  -   0.601000011

GNU Fortran 7.3.0，命令行

gfortran -std=gnu -O3 -ffast-math -mavx2 /mnt/c/Users/kkk/Documents/CPP/a.for

代码

  program test_xoroshiro128plus
  ! Benchmark driver: fills an n-by-n matrix with one drand128() call per
  ! element, then prints the elapsed time and the sample mean.
  use, intrinsic :: iso_fortran_env, only: real64
  implicit none
  integer, parameter :: n = 10000
  real(real64) :: A(n,n)
  integer :: i, j, t0, t1, count_rate, count_max

  call system_clock(t0, count_rate, count_max)
  ! Column-major traversal: the first index varies in the inner loop.
  do j = 1,n
     do i = 1,n
        A(i,j) = drand128()
     end do
  end do
  ! Do not add "A = drand128()" here: the function takes no arguments, so
  ! the reference is evaluated once and that single scalar is stored in
  ! every element of A, replacing the random values generated above.
  call system_clock(t1)

  ! Clock ticks divided by ticks per second gives seconds.
  print *, "Time :", real(t1-t0)/count_rate
  ! Sanity check: the mean is expected to be near 0.5 if drand128 is
  ! uniform on [0, 1). char(10) inserts a line feed before the samples.
  print *, "Mean :", sum(A)/size(A), char(10), A(1:2,1:3)

  contains

  impure elemental function drand128() result(r)
     ! Function form of the xoroshiro128+-style generator: advances two saved
     ! 64-bit state words and returns a double-precision value in [0, 1).
     !
     ! The function has no arguments, so every reference yields a scalar;
     ! assigning it to a whole array stores one value in all elements.
     use, intrinsic :: iso_fortran_env, only: int64, real64
     real(real64) :: r
     ! 2**(-63), so the final scaling is a multiplication, not a division.
     real(real64), parameter :: c = 1.0_real64/9223372036854775808.0_real64
     ! Generator state, private to this function; `save` is spelled out
     ! because the state must persist between calls.
     integer(int64), save :: s0 = 113_int64, s1 = 19937_int64

     s1 = ieor(s0, s1)
     ! rotl(s0, 55) xor s1 xor (s1 shifted left by 14)
     s0 = ieor(ieor(ior(ishft(s0, 55), ishft(s0, -9)), s1), ishft(s1, 14))
     ! rotl(s1, 36)
     s1 = ior(ishft(s1, 36), ishft(s1, -28))
     ! NOTE(review): s0 + s1 can exceed the int64 range; the generator
     ! presumably relies on wrap-around, which the standard does not guarantee.
     ! The logical right shift clears the sign bit before conversion to real.
     r = ishft(s0 + s1, -1) * c
  end function drand128

  end program

优化Fortran子例程

2 个答案: