我想使用 OpenMP 指令来加速子程序 Advection 中的雅可比(Jacobian)计算。下面是我尚未成功的尝试。标记为 section 1 至 section 4 的四段计算彼此独立。请注意,我并不是想用 OpenMP 来加速每一次 FFTW 调用本身。程序可以通过编译,但当 nx 和 ny 大于 128 时,程序会报错 'Segmentation fault (core dumped)'。按我的估算,所需内存为 32 * 8 * nx * ny 字节,而这台机器有 16 GB 内存。
为了把问题说清楚,我把 Advection 放进了下面的测试程序中。我的问题是:如何把程序组织成可以并发运行的相互独立的部分?
!! to compile: gfortran -fopenmp -I/usr/include -o Omp_Advection Omp_Advection.f90 -lfftw3
!> Wrapper module that makes the contents of the FFTW include file
!> 'fftw3.f03' available through a single `use FFTW3` statement.
!> The include file relies on the C-interoperability kinds, so
!> iso_c_binding is brought into scope first (and is therefore also
!> visible to every unit that uses this module).
module fftw3
   use, intrinsic :: iso_c_binding
   include 'fftw3.f03'
end module fftw3
!> Pseudo-spectral test module: holds the vorticity (omega) and stream
!> function (psi) on an (ny,nx) grid, their Fourier transforms, the spectral
!> derivative operators, and two procedures:
!>   * Make_Data  - builds the operators and the initial omega / psi fields;
!>   * Advection  - evaluates the Jacobian  psi_y*omega_x - psi_x*omega_y  in
!>                  physical space and stores its forward FFT in fftadvect.
!>
!> The FFTW plans `fwd` and `bak` must be created by the caller (for
!> (ny,nx) complex arrays) before either procedure is invoked.
module NS
  use FFTW3
  implicit none

  private :: wavenumbers

  ! constants
  real(kind=c_double), parameter :: pi = 3.1415926535897932_c_double
  ! imaginary unit (standard cmplx instead of the GNU-only complex() intrinsic)
  complex(kind=c_double_complex), parameter :: i = cmplx(0.0_c_double, 1.0_c_double, kind=c_double_complex)

  ! Variables and their FT
  integer(kind=c_int), parameter :: nx = 128, ny = 128 ! intended to be even
  real(kind=c_double) :: omega(ny,nx), psi(ny,nx)
  complex(kind=c_double_complex) :: fftomega(ny,nx), fftpsi(ny,nx)
  complex(kind=c_double_complex) :: fftadvect(ny,nx)

  ! Operators; all share the (ny,nx) layout of the fields they multiply
  complex(kind=c_double_complex) :: Dx(ny,nx), Dy(ny,nx)
  real(kind=c_double) :: laplacian(ny,nx), ilaplacian(ny,nx)

  ! needed for FFTW
  type(c_ptr) :: fwd, bak
  ! FFTW transforms are unnormalised: scale by 1/(nx*ny) after a backward FFT.
  real(kind=c_double), parameter :: norml = 1.0_c_double/(real(nx, c_double)*real(ny, c_double))

contains

  !> Integer wave numbers in FFT storage order for a transform of length n:
  !> 0, 1, ..., n/2, -((n-1)/2), ..., -1.
  pure function wavenumbers(n) result(k)
    integer(kind=c_int), intent(in) :: n
    real(kind=c_double) :: k(n)
    integer(kind=c_int) :: j

    k = [(real(j, c_double), j = 0, n/2), (real(-j, c_double), j = (n - 1)/2, 1, -1)]
  end function wavenumbers

  !> Build Dx, Dy, laplacian, ilaplacian, then the test vorticity omega and
  !> the stream function psi obtained from  fft(psi) = fft(omega)*ilaplacian.
  !> Writes the module variables omega, psi, fftomega and fftpsi.
  subroutine Make_Data
    implicit none
    integer(kind=c_int) :: k
    real(kind=c_double) :: kx_wave(nx), ky_wave(ny)
    real(kind=c_double), allocatable :: x(:), y(:), offset(:)
    complex(kind=c_double_complex), allocatable :: work(:,:)

    kx_wave = wavenumbers(nx)
    ky_wave = wavenumbers(ny)

    ! The second array index runs along x, the first along y.
    Dx = i*spread(kx_wave, 1, ny)
    Dy = i*spread(ky_wave, 2, nx)
    laplacian = -(spread(kx_wave**2, 1, ny) + spread(ky_wave**2, 2, nx))

    ! Invert the Laplacian everywhere except the mean mode (where it is zero),
    ! which is set to zero without ever performing the division.
    where (laplacian /= 0.0_c_double)
      ilaplacian = 1.0_c_double/laplacian
    elsewhere
      ilaplacian = 0.0_c_double
    end where

    ! Periodic grids on [-pi, pi) (end point excluded).
    x = [(2.0_c_double*pi*(real(k - 1, c_double)/real(nx, c_double)) - pi, k = 1, nx)]
    y = [(2.0_c_double*pi*(real(k - 1, c_double)/real(ny, c_double)) - pi, k = 1, ny)]

    ! offset depends on x only, so it is computed once outside the loop.
    offset = -pi/3.0_c_double + 2.0e-2_c_double*cos(12.0_c_double*x)

    ! Fill omega column by column (contiguous in memory).
    do k = 1, nx
      omega(:,k) = 1.0_c_double/cosh(30.0_c_double*(y - offset(k)))**2 &
                 - 1.0_c_double/cosh(30.0_c_double*(y + offset(k)))**2
    end do

    work = cmplx(omega, 0.0_c_double, kind=c_double_complex)
    call dfftw_execute_dft(fwd, work, fftomega)
    fftpsi = fftomega*ilaplacian
    call dfftw_execute_dft(bak, fftpsi, work)
    psi = norml*real(work)
  end subroutine Make_Data

  !> Compute the forward FFT of the Jacobian  psi_y*omega_x - psi_x*omega_y
  !> and store it in the module variable fftadvect.
  !>
  !> ftpsi : Fourier transform of psi,   shape (ny,nx)
  !> ftome : Fourier transform of omega, shape (ny,nx)
  !>
  !> The four spectral derivatives are independent and are evaluated in
  !> separate OpenMP sections, each with its own input/output buffers.
  !>
  !> All work arrays are allocatable (heap). As automatic arrays they lived
  !> on the stack (gfortran's -fopenmp implies -frecursive), and 8 complex +
  !> 4 real (ny,nx) arrays exceed a typical 8 MB stack limit once nx = ny = 256,
  !> which produced the segmentation fault.
  subroutine Advection(ftpsi, ftome)
    implicit none
    complex(kind=c_double_complex), intent(in) :: ftome(:,:), ftpsi(:,:)
    real(kind=c_double), allocatable :: Dx_psi(:,:), Dy_psi(:,:)
    real(kind=c_double), allocatable :: Dx_omega(:,:), Dy_omega(:,:)
    ! spec_* : derivative in Fourier space; phys_* : its backward transform
    complex(kind=c_double_complex), allocatable :: spec_dxpsi(:,:), phys_dxpsi(:,:)
    complex(kind=c_double_complex), allocatable :: spec_dypsi(:,:), phys_dypsi(:,:)
    complex(kind=c_double_complex), allocatable :: spec_dxome(:,:), phys_dxome(:,:)
    complex(kind=c_double_complex), allocatable :: spec_dyome(:,:), phys_dyome(:,:)
    complex(kind=c_double_complex), allocatable :: jac(:,:)

    ! The operators and the plans are built for (ny,nx) arrays only.
    if (any(shape(ftpsi) /= [ny, nx]) .or. any(shape(ftome) /= [ny, nx])) then
      error stop 'Advection: ftpsi and ftome must have shape (ny,nx)'
    end if

    allocate (Dx_psi(ny,nx), Dy_psi(ny,nx), Dx_omega(ny,nx), Dy_omega(ny,nx))
    allocate (spec_dxpsi(ny,nx), phys_dxpsi(ny,nx), spec_dypsi(ny,nx), phys_dypsi(ny,nx))
    allocate (spec_dxome(ny,nx), phys_dxome(ny,nx), spec_dyome(ny,nx), phys_dyome(ny,nx))
    allocate (jac(ny,nx))

    ! NOTE(review): executing one plan from several threads on different
    ! arrays relies on FFTW's execute routines being thread-safe, and on the
    ! work arrays having the same alignment as the arrays used at planning
    ! time - confirm against the FFTW manual for the installed version.
    !$omp parallel default(shared)
    !$omp sections
    !$omp section
    ! section 1: d(psi)/dx
    spec_dxpsi = Dx*ftpsi
    call dfftw_execute_dft(bak, spec_dxpsi, phys_dxpsi)
    Dx_psi = norml*real(phys_dxpsi)
    !$omp section
    ! section 2: d(psi)/dy
    spec_dypsi = Dy*ftpsi
    call dfftw_execute_dft(bak, spec_dypsi, phys_dypsi)
    Dy_psi = norml*real(phys_dypsi)
    !$omp section
    ! section 3: d(omega)/dx
    spec_dxome = Dx*ftome
    call dfftw_execute_dft(bak, spec_dxome, phys_dxome)
    Dx_omega = norml*real(phys_dxome)
    !$omp section
    ! section 4: d(omega)/dy
    spec_dyome = Dy*ftome
    call dfftw_execute_dft(bak, spec_dyome, phys_dyome)
    Dy_omega = norml*real(phys_dyome)
    !$omp end sections
    ! (implicit barrier above: all four derivatives are complete here)
    !$omp workshare
    jac = cmplx(Dy_psi*Dx_omega - Dx_psi*Dy_omega, 0.0_c_double, kind=c_double_complex)
    !$omp end workshare
    !$omp end parallel

    call dfftw_execute_dft(fwd, jac, fftadvect)
    ! Allocatable locals are released automatically on return.
  end subroutine Advection

end module NS
!> Test driver: creates the forward and backward 2-D FFTW plans, builds the
!> test fields with Make_Data, calls Advection n_steps times, and finally
!> releases the plans.
program Main
  use FFTW3, only: FFTW_FORWARD, FFTW_BACKWARD, FFTW_MEASURE
  use NS, only: nx, ny, fwd, bak, fftomega, fftpsi, Make_Data, Advection
  implicit none

  integer, parameter :: n_steps = 100
  integer :: step

  ! Prep FFTW. The plans are created before Make_Data fills fftomega/fftpsi,
  ! so anything the planner does to these arrays while measuring is harmless.
  call dfftw_plan_dft_2d(fwd, ny, nx, fftomega, fftpsi, FFTW_FORWARD, FFTW_MEASURE)
  call dfftw_plan_dft_2d(bak, ny, nx, fftomega, fftpsi, FFTW_BACKWARD, FFTW_MEASURE)

  call Make_Data

  do step = 1, n_steps
    call Advection(fftpsi, fftomega)
  end do

  ! Release the plans' resources.
  call dfftw_destroy_plan(fwd)
  call dfftw_destroy_plan(bak)
end program Main