我在Fortran 90中为矩阵产品状态编写了一个程序。在这个程序中,我使用的是英特尔MKL库。当编译器选项为:
时OPTIMIZE= -parallel -par-threshold90 -ipo -O3 -no-prec-div -fp-model fast=2 -xHost
LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl -static-intel,
在运行时,内存不断增加。 如果我将编译器选项更改为:
OPTIMIZE= -ipo -O3 -no-prec-div -fp-model fast=2 -xHost
LinkLine = $(OPTIMIZE) $(DIAGNOSE) -mkl=sequential -static-intel,
没有问题,一切都很好。 为什么会这样?这有什么解决方案吗?
按照Alexander的建议,我在下面发布了一个子程序(对矩阵进行奇异值分解):
SUBROUTINE SVD(Theta,S1,S2,lambda,Din,Dout)
IMPLICIT NONE
INTEGER, INTENT(IN) :: Din, Dout
COMPLEX(kind=rKind),DIMENSION(Din,Din), INTENT(IN) :: Theta
REAL(kind=rKind), INTENT(OUT) :: lambda(Dout)
COMPLEX(kind=rKind), INTENT(OUT) :: S1(Din,Dout),S2(Din,Dout)
COMPLEX(kind=rKind) :: M(Din,Din)
REAL(kind=rKind) :: temp_lam(Din)
COMPLEX(kind=rKind) :: temp_U(Din,Din), temp_V(Din,Din)
COMPLEX(kind=rKind) :: WORK(10*Din)
REAL(kind=rKind) :: RWORK(5*Din)
REAL(kind=rKind), PARAMETER :: Truncation = 1E-13_rKind
INTEGER :: INFO, i
!----------------------------
M = Theta
CALL ZGESVD('A','A',Din,Din,M,Din,temp_lam,temp_U,Din,temp_V,Din,WORK,10*Din,RWORK,INFO)
lambda = 0.0_rKind
S1 = 0.0_rKind
S2 = 0.0_rKind
DO i = 1,Dout
IF (temp_lam(i)/temp_lam(1)>Truncation) THEN
lambda(i) = temp_lam(i)
S1(:,i) = temp_U(:,i)
S2(:,i) = temp_V(i,:)
END IF
END DO
END SUBROUTINE SVD
另一个子程序是
SUBROUTINE TwoSiteUpdate(A_update,B_update,lambda_2_update,&
& A,B,lambda_1,lambda_2,U,D1,D2_old,D2_new)
! Designed for 1D MPS.
IMPLICIT NONE
INTEGER, INTENT(IN) :: D1, D2_old, D2_new
COMPLEX(kind=rKind), INTENT(IN) :: A(D1,D2_old,localSize), B(D2_old,D1,localSize)
REAL(kind=rKind), INTENT(IN) :: lambda_1(D1), lambda_2(D2_old)
COMPLEX(kind=rKind), INTENT(IN) :: U(localSize,localSize,localSize,localSize)
COMPLEX(kind=rKind), INTENT(OUT) :: A_update(D1,D2_new,localSize), B_update(D2_new,D1,localSize)
REAL(kind=rKind), INTENT(OUT) :: lambda_2_update(D2_new)
COMPLEX(kind=rKind) :: temp_A(D1,D2_old,localSize)
COMPLEX(kind=rKind) :: temp_B(D2_old,D1,localSize)
COMPLEX(kind=rKind) :: Theta(D1,localSize,D1,localSize)
COMPLEX(kind=rKind) :: Theta_S(D1*localSize,D1*localSize)
COMPLEX(kind=rKind) :: S1(D1*localSize,D2_new), S2(D1*localSize,D2_new)
COMPLEX(kind=rKind) :: S1_tmp(D1,localSize,D2_new)
COMPLEX(kind=rKind) :: S2_tmp(D1,localSize,D2_new)
REAL(kind=rKind), PARAMETER :: Truncation=1E-16_rKind
COMPLEX(kind=rKind) :: tmp_ele
INTEGER :: i1,i2,i3,im
INTEGER :: mA1,mB1,mA2,mB2
INTEGER :: i
!------------------- add lambda on A, B ----------------------
temp_A = 0.0_rKind
temp_B = 0.0_rKind
DO im = 1,localSize
DO i2 = 1,D2_old
DO i1 = 1,D1
IF(lambda_1(i1)>Truncation) THEN
temp_A(i1,i2,im) = A(i1,i2,im)*SQRT(lambda_1(i1))
temp_B(i2,i1,im) = B(i2,i1,im)*SQRT(lambda_1(i1))
END IF
END DO
END DO
END DO
!--------------- svd on Theta ----------------
Theta = 0.0_rKind
DO mB2 = 1,localSize
DO i2 = 1,D1
DO mA2 = 1,localSize
DO i1 = 1,D1
tmp_ele = 0.0_rKind
DO mB1 = 1,localSize
DO mA1 = 1,localSize
DO i3 = 1,D2_old
tmp_ele = tmp_ele + U(mA2,mB2,mA1,mB1)*temp_A(i1,i3,mA1)*temp_B(i3,i2,mB1)
END DO
END DO
END DO
Theta(i1,mA2,i2,mB2) = tmp_ele
END DO
END DO
END DO
END DO
Theta_S = RESHAPE(Theta,SHAPE=(/D1*localSize,D1*localSize/))
CALL SVD(Theta_S,S1,S2,lambda_2_update,D1*localSize,D2_new)
lambda_2_update = lambda_2_update/lambda_2_update(1)
S1_tmp = RESHAPE(S1,SHAPE=(/D1,localSize,D2_new/))
S2_tmp = RESHAPE(S2,SHAPE=(/D1,localSize,D2_new/))
!---------------- update A, B ----------------
A_update = 0.0_rKind
B_update = 0.0_rKind
DO im = 1,localSize
DO i2 = 1,D2_new
DO i1 = 1,D1
IF (lambda_1(i1)>Truncation) THEN
A_update(i1,i2,im) = S1_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
B_update(i2,i1,im) = S2_tmp(i1,im,i2)*SQRT(lambda_2_update(i2))/SQRT(lambda_1(i1))
END IF
END DO
END DO
END DO
END SUBROUTINE TwoSiteUpdate
以上两个子程序是程序的耗时部分。没有这两个子程序,程序就可以了,内存不会增加。有了这两个,当我使用并行版本时,内存会增加。
答案 0 :(得分:0)
您可以检查是否在循环中使用了带可分配项的复合类型。您也可以使用OpenMP并行指令和-openmp编译选项来手动控制并行的位置,而不是自动并行,如下所示
$omp parallel do
do i = 1, n
...
$ ifort -openmp foo.f90