Question

我想通过使用omp指令加速子程序平流中的雅可比计算。我的不成功的尝试如下所示。标记为第1至4部分的四个计算彼此独立。请注意，我不是试图使用openmp来加速每个fftw调用。该程序编译，但如果nx和ny是> 128，程序抛出'Segmentation fault（core dumped）'。根据我的计算，所需的内存是32 * 8 * nx * ny（以字节为单位），处理器有16 GB。

我在测试程序中包含了Advection，以增加问题的清晰度：如何将程序组织成可以并发运行的单独部分？

!! to compile: gfortran -fopenmp -I/usr/include -o Omp_Advection Omp_Advection.f90 -lfftw3

Module FFTW3
  use, intrinsic :: iso_c_binding
  include 'fftw3.f03'
End Module FFTW3

Module NS
  use FFTW3
  ! constants
  real (kind=c_double), parameter :: pi=3.1415926535897932d0
  complex (kind=c_double_complex) :: i=complex(0.d0,1.d0)
  ! Variables and their FT
  integer (kind=c_int), parameter :: nx=128,ny=128 ! even!
  real (kind=c_double) :: omega(ny,nx),psi(ny,nx)
  complex (kind=c_double_complex) :: fftomega(ny,nx),fftpsi(ny,nx)
  complex (kind=c_double_complex) :: fftadvect(ny,nx)
  ! Operators
  complex (kind=c_double_complex) :: Dx(nx,ny),Dy(ny,nx)
  real (kind=c_double) :: laplacian(ny,nx),ilaplacian(ny,nx)
  ! needed for FFTW
  type (c_ptr) :: fwd,bak
  real (kind=c_double) :: norml=1.d0/dfloat(nx*ny)
  contains

    Subroutine Make_Data
      implicit none
      integer(kind=c_int) :: kx,ky
      real (kind=c_double) :: lcdx(nx),d2x(nx),ks(nx/2+1),harvest(ny,nx)
      real (kind=c_double),allocatable :: t(:),x(:),offset(:)
      complex (kind=c_double_complex),allocatable :: work(:,:)
      ! keep the factor i out until Dx and Dy
      ks=(/(dfloat(kx-1),kx=1,nx/2+1)/)
      t=(/(ks(kx),kx=nx/2,2,-1)/)
      lcdx=(/ks,-t/)           
      d2x=-lcdx*lcdx
      Dx=i*spread(lcdx,1,nx)
      Dy=i*spread(lcdx,2,ny)
      laplacian=spread(d2x,2,nx)+spread(d2x,1,ny)
      ilaplacian=1./laplacian
      ilaplacian(1,1)=0.d0
      ! make omega and compute psi, x and y are equal
      x=2.0d0*pi*(/(dfloat(kx-1)/(dfloat(nx)),kx=1,nx+1)/)-pi
      x=x(1:nx)                 ! this is y as well
      do kx=1,nx
         offset=-pi/3.0+2.d-2*cos(12.d0*x)         
         omega(kx,:)=1.d0/dcosh(30.d0*(x(kx)-offset))**2&
              &-1.d0/dcosh(30.d0*(x(kx)+offset))**2
      end do
      work=omega+i*0.d0
      call dfftw_execute_dft(fwd,work,fftomega)
      fftpsi=fftomega*ilaplacian
      call dfftw_execute_dft(bak,fftpsi,work)
      psi=norml*real(work)
    End Subroutine Make_Data

    Subroutine Advection (ftpsi,ftome)
      implicit none
      real(c_double) :: Dx_psi(ny,nx),Dy_psi(ny,nx)
      real(c_double) :: Dx_omega(ny,nx),Dy_omega(ny,nx)
      complex (kind=c_double_complex),intent(in) :: ftome(:,:),ftpsi(:,:)
      complex (kind=c_double_complex) :: D1(ny,nx),D2(ny,nx)
      complex (kind=c_double_complex) :: D3(ny,nx),D4(ny,nx)
      complex (kind=c_double_complex) :: D5(ny,nx),D6(ny,nx)
      complex (kind=c_double_complex) :: D7(ny,nx),D8(ny,nx)
!$omp parallel
!$omp sections
!$omp section
      ! section 1
      D1=Dx*ftpsi
      call dfftw_execute_dft(bak,D1,D2)
      Dx_psi=norml*real(D2)
!$omp section
      ! section 2
      D3=Dy*ftpsi
      call dfftw_execute_dft(bak,D3,D4)
      Dy_psi=norml*real(D4)
!$omp section
      ! section 3
      D5=Dx*ftome
      call dfftw_execute_dft(bak,D5,D6)
      Dx_omega=norml*real(D6)
!$omp section
      ! section 4
      D7=Dy*ftome
      call dfftw_execute_dft(bak,D7,D8)
      Dy_omega=norml*real(D8)
      ! end sections
!$omp end sections
!$omp workshare
      D1 = Dy_psi*Dx_omega - Dx_psi*Dy_omega+0.d0*complex(0.d0,0.d0);
!$omp end workshare
!$omp end parallel
      call dfftw_execute_dft(fwd,D1,fftadvect)

    End Subroutine Advection

End Module NS

Program Main
  use FFTW3
  use NS
  integer :: kx
  ! Prep FFTW
  call dfftw_plan_dft_2d(fwd,ny,nx,fftomega,fftpsi,FFTW_FORWARD,FFTW_MEASURE)
  call dfftw_plan_dft_2d(bak,ny,nx,fftomega,fftpsi,FFTW_BACKWARD,FFTW_MEASURE) 
  call Make_Data
  do kx=1,100
     call Advection(fftpsi,fftomega)
     end do

End Program Main

使用openmp并行Fortran FFT

0 个答案: