我使用gfortran -O -mcmodel=medium test.f90 -lfftw3_omp -lfftw3 -lm -qopenmp
编译了以下程序,并将fftw3.f
添加到/usr/lib
。我使用ulimit -s unlimited to run ./out
因为我得到了堆栈溢出(Why Segmentation fault is happening in this openmp code?)。
program main
Implicit none
include 'omp_lib.h'
include 'fftw3.f'
Integer i, j, k, iter
Integer,Parameter :: Nx =381
Integer,Parameter :: Ny =129
Integer,Parameter :: Nz =129
Integer, parameter :: N1 = Ny-1
Integer, parameter :: N2 = Nz-1
double precision in
dimension in(N1,N2)
double complex out
dimension out(N1/2 + 1, N2)
integer*8 plan
real*8 U(-2:Nx+2,-2:Ny+2,-2:Nz+2)
real*8 V(-2:Nx+2,-2:Ny+2,-2:Nz+2)
real*8 W(-2:Nx+2,-2:Ny+2,-2:Nz+2)
Real*8 ky_vis(0:Ny)
Real*8 kz_vis(0:Nz)
Complex*16 F_F1(-2:Nx+2,-2:Ny+2,-2:Nz+2)
Real*8,Parameter :: pi = 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825d0
Real*8,Parameter :: Ly = 1.0d0 * 2.0d0 * pi
Real*8,Parameter :: Lz = 1.0d0 * 2.0d0 * pi
Real*8 c1_Nfft
Integer,Parameter :: NyFFT = (Ny-1 + 2) / 2
c1_Nfft = 1.0d0 / dsqrt( Dble(Ny-1) * Dble(Nz-1) )
U= reshape((/(i, i=1,(Nx+5)*(Ny+5)*(Nz+5))/),shape(U))
V = reshape((/(i, i=1,(Nx+5)*(Ny+5)*(Nz+5))/),shape(V))
W = reshape((/(i, i=1,(Nx+5)*(Ny+5)*(Nz+5))/),shape(W))
!$OMP PARALLEL DO PRIVATE(j) SHARED(ky_vis)
Do j = 1, (Ny-1)/2+1
ky_vis(j) = ( 2.0d0 * pi /Ly ) * Dble( j-1 )
End Do
!$OMP End PARALLEL DO
!$OMP PARALLEL DO PRIVATE(j) SHARED(ky_vis)
Do j=(Ny-1)/2+2,Ny-1
ky_vis(j) = ( 2.0d0 * pi /Ly ) * Dble((Ny-1)-(j-1))
end do
!$OMP End PARALLEL DO
!$OMP PARALLEL DO PRIVATE(k) SHARED(kz_vis)
Do k = 1, (Nz-1)/2+1
kz_vis(k) = ( 2.0d0 * pi /Lz ) * Dble( k-1 )
End Do
!$OMP End PARALLEL DO
!$OMP PARALLEL DO PRIVATE(j) SHARED(ky_vis)
Do k=(Nz-1)/2+2,Nz-1
kz_vis(k) = ( 2.0d0 * pi /Lz ) * Dble((Nz-1)-(k-1))
end do
!$OMP End PARALLEL DO
!-----------FFTW----------------
Do i = 1, Nx-1 ! --- \8F\87\95ϊ\B7 ---
! !$OMP PARALLEL DO PRIVATE(i,j,k) SHARED(Real_F1)
Do k = 1, Nz-1
Do j = 1, Ny-1
in(j,k) = U(i,j,k)
End Do; End Do
! !$OMP END PARALLEL DO
call dfftw_plan_dft_r2c_2d(plan,N1,N2,in,out,FFTW_ESTIMATE)
call dfftw_execute_dft_r2c(plan, in, out)
call dfftw_destroy_plan(plan)
! !$OMP PARALLEL DO PRIVATE(j,k) SHARED(F_F1)
Do j = 1, NyFFT
Do k = 1, Nz-1
F_F1(i,j,k) = out(j,k) * c1_Nfft
End Do; End Do
! !$OMP END PARALLEL DO
End Do
!-----------IFFTW----------------
Do i = 1, Nx-1 !
Do j = 1, NyFFT
Do k = 1, Nz-1
out(j,k) = - ( ky_vis(j) * ky_vis(j) &
+ kz_vis(k) * kz_vis(k) ) * F_F1(i,j,k)
End Do; End Do
call dfftw_plan_dft_c2r_2d(plan,N1,N2,out,in,FFTW_ESTIMATE)
call dfftw_execute_dft_c2r(plan, out, in)
call dfftw_destroy_plan(plan)
! !$OMP PARALLEL DO PRIVATE(j,k) SHARED(ddUdx)
Do k = 1, Nz-1
Do j = 1, Ny-1
U(i,j,k) = U(i,j,k) + in(j,k) * c1_Nfft
End Do; End Do
! !$OMP END PARALLEL DO
End Do
end program
我使用system_clock
检查FFTW执行时间,发现实际上计算速度太慢。有没有更好的方法让它运行得更快?我在Ubuntu 17.04上使用gfortran,CPU是AMD1950x,内存是64G。
答案 0 :(得分:0)
首先,正如我之前在评论中指出的那样,你应该从不反复做
do ... !your loop
call dfftw_plan_dft_r2c_2d(plan,N1,N2,in,out,FFTW_ESTIMATE)
call dfftw_execute_dft_r2c(plan, in, out)
call dfftw_destroy_plan(plan)
end do
在循环中多次提供N1
和N2
是不变的。 FFTW_ESTIMATE
并不是一场大灾难,但它仍然很慢。您应该将计划一次并重新使用。创建FFTW计划很慢。
call dfftw_plan_dft_r2c_2d(plan,N1,N2,in,out,FFTW_ESTIMATE)
do .... !your loop
call dfftw_execute_dft_r2c(plan, in, out)
end do
call dfftw_destroy_plan(plan)
其次,您必须首先指示FFTW使用OpenMP线程。这一切都在手册中,我将在我的代码中展示我是如何做到的:
use iso_c_binding
integer(c_int) :: nthreads
integer(c_int) :: error
!$ nthreads = omp_get_num_threads()
error = fftw_init_threads()
if (error==0) then
write(*,*) "Error when initializing FFTW for threads."
else
call fftw_plan_with_nthreads(nthreads)
end if
这使用现代的FFTW Fortran接口,而不是您传统的Fortran 90接口(是的Fortran 90已经过时了)。但它在遗留界面中应该相当类似,例如
integer :: nthreads, error
!$ nthreads = omp_get_num_threads()
call dfftw_init_threads(error)
call dfftw_plan_with_nthreads(nthreads)