我正在进行一个仿真,其瓶颈在于计算大量复数双精度矩阵的指数。我发现 Fortran(Expokit)虽然在小型矩阵上表现良好,但对于较大的矩阵,其性能却不如 Matlab 或 Python。
虽然需要更大的矩阵才能明显体现性能差异,但我在下面附上了一个表现出类似行为的示例程序。从性能分析器(profiler)的结果和源代码来看,Expokit 的大部分时间似乎都花在调用 zgemm() 上,因此我唯一能想到的原因是我的 BLAS 安装有问题;否则我不明白为什么 Fortran 的性能会比 Matlab 或 Python 差。如果有人能就如何提高 Fortran 矩阵指数代码的性能提供任何见解,我将不胜感激。
计算10000个矩阵(尺寸依次为4x4、8x8、30x30、60x60、80x80)的耗时结果(单位:秒):
Matlab:0.91、0.97、2.36、5.45、8.69
Python(s):2.59、2.89、9.70、35.4、72.7
Fortran,Expokit:0.037、0.12、4.14、30.6、74.9
Fortran,Expokit,OpenMP(使用8个内核):0.0039、0.016、0.52、3.87、9.53
Fortran代码:
subroutine expokit_test()
  ! Benchmark driver: evaluates expm_complex(-i * mat * val) n_reps times on
  ! an n-by-n complex test matrix with mat(a, b) = a * b, spreading the
  ! repetitions over n_threads OpenMP threads, and prints the elapsed
  ! wall-clock time in seconds as reported by omp_get_wtime.
  use omp_lib
  use, intrinsic :: iso_fortran_env, only: output_unit
  implicit none
  integer, parameter :: wp = selected_real_kind(15, 307)
  ! Named n rather than size so the intrinsic size() is not shadowed.
  integer, parameter :: n = 80
  integer, parameter :: n_reps = 10000
  integer, parameter :: n_threads = 8
  complex(wp), parameter :: i = (0, 1._wp)
  integer :: count, a, b
  real(wp) :: wtime
  complex(wp) :: mat_exp(n, n), mat(n, n), val

  val = 1E-8_wp

  ! Column-major fill: the first index varies fastest in the inner loop.
  ! Every element is assigned, so no separate zero-initialisation is needed.
  do b = 1, n
    do a = 1, n
      mat(a, b) = a * b
    end do
  end do

  call omp_set_num_threads(n_threads)
  wtime = omp_get_wtime()
  ! default(none) forces every variable used in the region to be listed, so a
  ! forgotten work array cannot silently become shared or private.
  ! The scaling -i * mat * val is deliberately kept inside the timed loop so
  ! that each repetition does the same work as before.
  !$omp parallel do default(none) &
  !$omp& shared(mat, val) private(mat_exp)
  do count = 1, n_reps
    mat_exp = expm_complex(-i * mat * val)
  end do
  !$omp end parallel do
  wtime = omp_get_wtime() - wtime

  ! real() converts to default real for any kind, unlike the specific sngl.
  write(output_unit, *) 'expm_complex', real(wtime)
end subroutine expokit_test
function expm_complex(A) result(B)
  ! Calculate the matrix exponential exp(A) of a square complex matrix A
  ! using the Expokit routine ZGPADM (Pade approximation with scaling and
  ! squaring).
  !
  ! A : square complex matrix, n-by-n.
  ! B : result, same shape as A.
  !
  ! Stops with an error message if A is not square or if ZGPADM reports a
  ! non-zero exit flag.
  !
  ! NOTE(review): ZGPADM is declared with double precision / complex*16
  ! arguments; this assumes kind wp (selected_real_kind(15, 307)) is the
  ! platform's double precision - confirm on unusual targets.
  use, intrinsic :: iso_fortran_env, only: error_unit
  implicit none
  integer, parameter :: wp = selected_real_kind(15, 307)
  complex(wp), dimension(:, :), intent(in) :: A
  complex(wp), dimension(size(A, 1), size(A, 2)) :: B
  integer, parameter :: ideg = 2 ! Pade degree; 6 is recommended, but 2 appears to be stable
  ! ZGPADM takes the time scale as a REAL double precision scalar. It is
  ! assigned in the body (not initialised here) so it does not acquire the
  ! implicit SAVE attribute, which matters when called from OpenMP threads.
  real(wp) :: t
  ! Workspace of the minimum length ZGPADM requires: 4*n*n + ideg + 1.
  complex(wp), dimension(4 * size(A, 1) * size(A, 2) + ideg + 1) :: wsp
  integer, dimension(size(A, 1)) :: iwsp
  integer :: iexp, ns, iflag, n

  n = size(A, 1)
  if (size(A, 2) /= n) then
    write(error_unit, '(a, i0, a, i0)') &
      'expm_complex: matrix must be square, got ', n, ' x ', size(A, 2)
    error stop 'expm_complex: non-square input'
  end if

  t = 1._wp
  call ZGPADM(ideg, n, t, A, n, wsp, size(wsp, 1), iwsp, iexp, ns, iflag)
  if (iflag /= 0) then
    write(error_unit, '(a, i0)') 'expm_complex: ZGPADM failed, iflag = ', iflag
    error stop 'expm_complex: ZGPADM failed'
  end if

  ! ZGPADM leaves exp(t*A) in the workspace starting at index iexp,
  ! stored column by column.
  B = reshape(wsp(iexp : iexp + n * n - 1), [n, n])
end function expm_complex
Matlab代码:
% Benchmark: time 1E4 evaluations of expm on an n-by-n complex matrix
% whose entries are A(a, b) = 1E-9 + a*b.
n = 80;                    % matrix dimension (named n so the builtin size() is not shadowed)
idx = 1:n;
% Build A in one shot as an outer product. This always creates a fresh
% n-by-n matrix, so a stale or larger A left in the workspace cannot leak
% into the benchmark, and nothing is grown element by element.
A = 1E-9 + idx.' * idx;
tic
for test = 1:1E4
    t = expm(-1i*A*1E-8);
end
toc
Python代码:
# Benchmark: time 10**4 calls to la.expm on a size-by-size complex matrix.
size = 80
# mat[a, b] = (a + 1) * (b + 1) + 1E-9, assembled as an outer product of
# the vector 1..size with itself instead of an element-by-element loop.
index = np.arange(1, size + 1, dtype=float)
mat = np.outer(index, index) + 1E-9
start = time.time()
for _ in range(int(1E4)):
    test = la.expm(-1j * mat * 1E-8)
# Elapsed wall-clock time in seconds.
end = time.time() - start
print('time taken', end)