我会在这里发布整个代码段,但唯一的问题是最后的嵌套循环。所有读入矩阵的尺寸为180x180,循环速度难以忍受。我没有看到简化计算的简单方法,因为获得矩阵“AnaInt”的索引式乘法不是简单的矩阵乘积,因为索引有三倍的出现。有什么想法吗?谢谢!
program AC
implicit none
integer, parameter :: dp = selected_real_kind(15, 307)
integer :: n, ndim, k, j, i, o, l, m, steps
real(dp) :: emax, omega, pi, EFermi, auev
complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone
complex(dp), allocatable :: GammaL(:,:)
complex(dp), allocatable :: GammaL_EB(:,:)
complex(dp), allocatable :: GammaR(:,:)
complex(dp), allocatable :: R(:,:)
complex(dp), allocatable :: Yc(:,:)
complex(dp), allocatable :: Yd(:,:)
complex(dp), allocatable :: AnaInt(:,:)
complex(dp), allocatable :: H(:,:)
complex(dp), allocatable :: HamEff(:,:)
complex(dp), allocatable :: EigVec(:,:)
complex(dp), allocatable :: InvEigVec(:,:)
complex(dp), allocatable :: EigVal(:)
complex(dp), allocatable :: ctemp(:,:)
complex(dp), allocatable :: ctemp2(:,:)
complex(dp), allocatable :: S(:,:)
complex(dp), allocatable :: SelfL(:,:)
complex(dp), allocatable :: SelfR(:,:)
complex(dp), allocatable :: SHalf(:,:)
complex(dp), allocatable :: InvSHalf(:,:)
complex(dp), allocatable :: HEB(:,:)
complex(dp), allocatable :: Integrand(:,:)
!Lapack arrays and variables
integer :: info, lwork
complex(dp), allocatable :: work(:)
real(dp), allocatable :: rwork(:)
integer,allocatable :: ipiv(:)
!########################################################################
!Constants
auev = 27.211385
pi = 3.14159265359
cunit = (0,1)
czero = (0,0)
cone = (1,0)
tinyc = (0.0, 0.000000000001)
!System and calculation parameters
open(unit=123, file="ForAC.dat", action='read', form='formatted')
read(123,*) ndim, EFermi
lwork = ndim*ndim
emax = 5.0/auev
steps = 1000
allocate(HEB(ndim,ndim))
allocate(H(ndim,ndim))
allocate(Yc(ndim,ndim))
allocate(Yd(ndim,ndim))
allocate(S(ndim,ndim))
allocate(SelfL(ndim,ndim))
allocate(SelfR(ndim,ndim))
allocate(HamEff(ndim,ndim))
allocate(GammaR(ndim,ndim))
allocate(GammaL(ndim,ndim))
allocate(AnaInt(ndim,ndim))
allocate(EigVec(ndim,ndim))
allocate(EigVal(ndim))
allocate(InvEigVec(ndim,ndim))
allocate(R(ndim,ndim))
allocate(GammaL_EB(ndim,ndim))
allocate(Integrand(ndim,ndim))
!################################################
read(123,*) H, S, SelfL, SelfR
close(unit=123)
HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))
allocate(SHalf(ndim, ndim))
allocate(InvSHalf(ndim,ndim))
SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))
call zpotrf('l', ndim, SHalf, ndim, info)
InvSHalf(:,:) = SHalf(:,:)
call ztrtri('l', 'n', ndim, InvSHalf, ndim, info)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
deallocate(SHalf)
deallocate(InvSHalf)
!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal
allocate(ctemp(ndim,ndim))
ctemp(:,:) = HamEff(:,:)
allocate(work(lwork),rwork(2*ndim))
call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
if(info/=0)write(*,*) "Warning: zgeev info=", info
deallocate(work,rwork)
deallocate(ctemp)
InvEigVec(:,:)=EigVec(:,:)
lwork = 3*ndim
allocate(ipiv(ndim))
allocate(work(lwork))
call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
if(info/=0)write(*,*) "Warning: zgetrf info=", info ! LU decomposition
call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
deallocate(work)
deallocate(ipiv)
R(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
do k=1,ndim
do l=1,ndim
R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
end do
end do
end do
end do
!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000
open(unit=125,file="ACCond.dat")
!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do i=1,ndim
do n=1,ndim
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
end do
end do
end do
end do
Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)
ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do
!#############################################
deallocate(Integrand)
deallocate(HEB)
deallocate(Yc)
deallocate(Yd)
deallocate(HamEff)
deallocate(GammaR)
deallocate(GammaL)
deallocate(AnaInt)
deallocate(EigVec)
deallocate(EigVal)
deallocate(InvEigVec)
deallocate(H)
deallocate(S)
deallocate(SelfL)
deallocate(SelfR)
deallocate(R)
deallocate(GammaL_EB)
end program AC
所以,这是根据建议进行的第一次调整:
HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do
HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do
R(:,:) = 0.0_dp
R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))
open(unit=125,file="ACCond.dat")
!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
T(j,m) = R(j,m) * Integrand(j,m)
end do
end do
AnaInt = matmul(EigVec,matmul(T,HermEigVec))
Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)
ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do
答案 0 :(得分:2)
您的代码中存在几个问题。 让我们从你强调的那个循环开始(它更容易理解,但是下面的大循环或多或少有相同的问题)。
所以我们在i,j,k,l上有一个循环。
您可以考虑重新排序循环,以获得更好的缓存访问权限。您最内部的循环在l上,它只显示为列索引。使用Fortran中的column-major数组,您可能会遇到性能不佳的情况。 j上的内循环可能会更好。
更糟糕的是,你的整个循环是由三个矩阵(InvEigVec * GammaR * InvEigVec ^ H)的乘积进行矩阵更新,但是你在O(ndim ^ 4)中完成。每个矩阵乘积为O(n ^ 3)(如果使用ZGEMM调用优化Strassen algorithm,则可能更少)。因此,通过存储矩阵产品,两个产品应该是O(n ^ 3),而不是O(n ^ 4)。 也就是说,你可以做矩阵产品,然后是矩阵产品更新。
现在,你的大循环:步骤次超过i,n,j,m。
如果我读得好,你就写了
Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
右侧的所有变量都是标量,但Integrand是ndim * ndim矩阵。在多个地方复制一个值的工作很多。 但是你在Integrand上循环,你可以只使用一个标量。或许这是一个错误,你应该在左手边有Integrand(j,m)或类似的东西?
然后,你的四个内部循环就像前面的评论一样,更新了 AnaInt with array product EigVec *(R。* Integrand)* EigVec ^ H,with。*(term by term)数组的标量积(或者只有EigVec * R * EigVec ^ H,如果Integrand只是一个标量)。
同样,尝试用ZGEMM编写它可能会很好,从而降低了复杂性。
答案 1 :(得分:1)
您是否考虑过使用OPENMP对循环进行并行化?它很容易实现。如果有兴趣,我可以给你一些提示。
试着看看这里:openMP DO tutorial