openmp和共享与私有

时间:2020-07-01 17:05:19

标签: fortran openmp gnu

我正在尝试使用 OpenMP 并行化我的代码,但不确定哪些变量应该是私有的(private),哪些应该是共享的(shared)。我经常使用 MPI,但没有用过 OpenMP。

现在,我正在使用gnu编译器。我在底部包含要用gfortran -fopenmp编译的程序的副本。该代码运行计算的串行版本,然后尝试并行化版本。

在这段代码中,有下面这个我想要并行化的嵌套 do 循环。它遍历整个计算域,对于每个 (ix,iz),都在整个域上做一次向量化的积分。可以把它看作 Biot-Savart 定律的数值积分,其被积函数中用到了格林函数。

这是我要并行化的循环

! Serial reference loop: for each target point (ix,iz), integrate over the whole grid.
DO ix=1,nx
     DO iz=1,nz
        ! Whole-array separation between the target point and every grid point
        rpx = x(ix)-x2D
        rpz = z(iz)-z2D
        magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
        integrand1 = Jy*rpz/(magrp**2.0_num)
        integrand2 =-Jy*rpx/(magrp**2.0_num)
        ! Overwrite the (ix,iz) entry - presumably the self term, where magrp is zero
        ! when x2D(ix,iz) = x(ix) and z2D(ix,iz) = z(iz) as in the full program
        integrand1(ix,iz) = 0.0_num
        integrand2(ix,iz) = 0.0_num
        ! One weighted sum over the whole grid per output element
        bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
        bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
     END DO
  END DO
! Spot check: print a single element near the middle of the grid
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))

其中 x2D、z2D、magrp、integrand1、integrand2、Jy 都是二维数组,每个 Bx_BS(ix,iz) 和 Bz_BS(ix,iz) 都是对这些数组在整个域上求和得到的。

我第一次尝试并行化看起来像

! First parallelisation attempt, shown as posted.
! NOTE(review): with default(shared) only ix, iz and tid are private. The work
! arrays rpx, rpz, magrp, integrand1 and integrand2 are therefore shared, yet
! every thread overwrites them as whole arrays in each iteration, so threads
! race on them. nthreads and seconds are also shared and assigned by every thread.
!$omp parallel default(shared) private(ix,iz,tid)
  tid = OMP_GET_THREAD_NUM()
  nthreads = OMP_GET_NUM_THREADS()
  seconds = omp_get_wtime()
! The iterations of the outer ix loop are divided among the threads
!$omp do
  DO ix=1,nx
     DO iz=1,nz
        ! Each of these whole-array assignments writes a shared array
        rpx = x(ix)-x2D
        rpz = z(iz)-z2D
        magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
        integrand1 = Jy*rpz/(magrp**2.0_num)
        integrand2 =-Jy*rpx/(magrp**2.0_num)
        integrand1(ix,iz) = 0.0_num
        integrand2(ix,iz) = 0.0_num
        ! Each (ix,iz) element is written by exactly one iteration, so sharing
        ! the result arrays is not itself a conflict
        bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
        bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
     END DO
  END DO
!$omp end do
!$omp end parallel
! Spot check: print a single element near the middle of the grid
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))

但是,串行积分与并行积分打印出的测试结果并不相同;而且 Bx_BS 的值每次运行都不一样,所以肯定有什么地方出了问题。

我的假设是:线程需要读取的所有数组都必须共享,目标数组(Bx_BS、Bz_BS)也应该共享,所以我只把嵌套 do 循环的索引设为私有变量,我认为这些循环是由 OpenMP 接管的。

有人能看出我哪里做错了吗?

谢谢!

PROGRAM internal_2D

  USE omp_lib
  
  IMPLICIT NONE
  
  ! Working real kind: the kind of a double precision literal
  INTEGER, PARAMETER :: num=kind(1.0d0)
  ! Number of grid points along x and z (pdiv_2D requires nx == nz)
  INTEGER, PARAMETER :: nx=32, nz=32
  REAL(num), PARAMETER :: pi = 3.14159265358979323_num
  ! L: domain width; dx, dz: grid spacing; dS = dx*dz; wtube: test tube radius;
  ! seconds: wall-clock timer
  REAL(num) :: L, dx, dz, dS, wtube, seconds
  INTEGER :: ix, iz, m, out_unit, tid, nthreads
  ! 1-D coordinate axes and their 2-D (nx,nz) counterparts
  REAL(num), DIMENSION(:), ALLOCATABLE :: x, z
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: x2D, z2D, r
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: bx, by, bz,jx, jy, jz, Ax, Ay, Az
  ! Work arrays of the integration loops (overwritten for every target point)
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: rpx, rpy, rpz, magrp, integrand1, integrand2, integrand3
  ! Results of the numerical integration
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: bx_BS, by_BS, bz_BS, Ax_BS, Ay_BS, Az_BS
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: curlBx_BS, curlBy_BS, curlBz_BS, curlAx_BS, curlAy_BS, curlAz_BS
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: temp1, temp2, divB, curlBx, curlBy, curlBz, curlAx, curlAy, curlAz
  REAL(num), DIMENSION(:,:), ALLOCATABLE :: del2Ay, weight, del2Ay_BS, curl_diff_B_BS

  ALLOCATE(x(1:nx), z(1:nz))
  ALLOCATE(x2D(1:nx,1:nz), z2D(1:nx,1:nz), r(1:nx,1:nz))
  ALLOCATE(jx(1:nx,1:nz), jy(1:nx,1:nz), jz(1:nx,1:nz))
  ALLOCATE(bx(1:nx,1:nz), by(1:nx,1:nz), bz(1:nx,1:nz))
  ALLOCATE(Ax(1:nx,1:nz), Ay(1:nx,1:nz), Az(1:nx,1:nz))
  ALLOCATE(temp1(1:nx,1:nz),temp2(1:nx,1:nz),divB(1:nx,1:nz))
  ALLOCATE(curlBx(1:nx,1:nz),curlBy(1:nx,1:nz),curlBz(1:nx,1:nz))
  ALLOCATE(curlAx(1:nx,1:nz),curlAy(1:nx,1:nz),curlAz(1:nx,1:nz))
  ALLOCATE(del2Ay(1:nx,1:nz),weight(1:nx,1:nz))
  ALLOCATE(magrp(1:nx,1:nz), integrand1(1:nx,1:nz), integrand2(1:nx,1:nz))
  ! rpx, rpz and curlBy_BS are assigned as whole arrays later on. Allocate them
  ! here instead of relying on automatic allocation on first assignment, so the
  ! program also works when that feature is disabled (e.g. -fno-realloc-lhs)
  ! and the work arrays are already allocated before any OpenMP region starts.
  ALLOCATE(rpx(1:nx,1:nz), rpz(1:nx,1:nz))
  ALLOCATE(bx_BS(1:nx,1:nz),by_BS(1:nx,1:nz),bz_BS(1:nx,1:nz))
  ALLOCATE(Ax_BS(1:nx,1:nz),Ay_BS(1:nx,1:nz),Az_BS(1:nx,1:nz))
  ALLOCATE(curlAx_BS(1:nx,1:nz),curlAz_BS(1:nx,1:nz),del2Ay_BS(1:nx,1:nz))
  ALLOCATE(curlBy_BS(1:nx,1:nz))
  ALLOCATE(curl_diff_B_BS(1:nx,1:nz))
  
  ! Side length of the square domain; both axes run from -L/2 to +L/2
  L = 1.0_num

  write(*,*) '----------------------------------------------------------'
  write(*,*) 'Setting up arrays'

  ! Uniformly spaced node coordinates along each axis, end points included
  x = [((-L/2.0_num) + L*DBLE(ix-1)/DBLE(nx-1), ix = 1, nx)]
  dx = x(2)-x(1)
  z = [((-L/2.0_num) + L*DBLE(iz-1)/DBLE(nz-1), iz = 1, nz)]
  dz = z(2)-z(1)

  ! Area of one grid cell
  dS = dx * dz

  ! 2-D coordinate fields: x2D(i,j) = x(i) and z2D(i,j) = z(j)
  x2D = spread(x, dim=2, ncopies=nz)
  z2D = spread(z, dim=1, ncopies=nx)

  ! Build the test fields bx, bz and jy on the 2-D grid.
  print*, '----------------------------------------------------------'
  print*, 'defining J and B and A for test' 
  ! Radius of the tube; the piecewise definitions below switch at r = wtube
  wtube = 0.5d0
  ! Distance of every grid point from the origin
  r = sqrt(x2d**2.0_num+z2d**2.0_num)
  bx = 0.0_num
  bz = 0.0_num
  jy = 0.0_num
  ! Inside the tube: polynomial-in-r profiles for bx, bz and a non-zero jy
  where (r .le. wtube)
     bx = z2d * ( (0.25_num*r**2.0_num/(1.0_num*wtube**2.0_num)) - &
                  (2.00_num*r**3.0_num/(5.0_num*wtube**3.0_num)) + &
                  (1.00_num*r**4.0_num/(6.0_num*wtube**4.0_num)) )
     bz =-x2d * ( (0.25_num*r**2.0_num/(1.0_num*wtube**2.0_num)) - &
                  (2.00_num*r**3.0_num/(5.0_num*wtube**3.0_num)) + &
                  (1.00_num*r**4.0_num/(6.0_num*wtube**4.0_num)) )
     jy = (1.0_num-(r/wtube))**2.0_num * (r/wtube)**2.0_num
  ! Outside the tube: bx, bz scale as coordinate/r**2; jy keeps the zero set above
  elsewhere
     bx = z2d*wtube**2.0_num / (60.0_num*r**2.0_num)
     bz =-x2d*wtube**2.0_num / (60.0_num*r**2.0_num)
  end where

  
  ! Divergence check: d(bx)/dx + d(bz)/dz, reported as max and total magnitude
  write(*,*) '----------------------------------------------------------'
  write(*,*) 'checking divB=0'
  temp1 = 0.0_num
  temp2 = 0.0_num
  call pdiv_2D(bx, temp1, nx, 1, dx)
  call pdiv_2D(bz, temp2, nz, 2, dz)
  divB = temp1 + temp2
  write(*,*) 'maxval(abs(div.B)) ', maxval(abs(divB))
  write(*,*) 'total(abs(div.B)) ', sum(abs(divB))

  ! Curl check: d(bx)/dz - d(bz)/dx compared against jy
  write(*,*) '----------------------------------------------------------'
  write(*,*) 'checking curlB=J'
  temp1 = 0.0_num
  temp2 = 0.0_num
  call pdiv_2D(bx, temp1, nx, 2, dz)
  call pdiv_2D(bz, temp2, nz, 1, dx)
  curlBy = temp1 - temp2
  write(*,*) 'maxval(abs(curlBy-Jy)) ', maxval(abs(curlBy-Jy))
  write(*,*) 'total(abs(curlBy-Jy)) ', sum(abs(curlBy-jy))
  
  ! Serial reference integration: fills bx_BS, bz_BS and Ay_BS point by point.
  print*, '----------------------------------------------------------'
  print*, '2D vectorized B-S integration'
  ! Quadrature weights: 4 in the interior, 2 on the edges, 1 at the corners.
  ! Combined with the dS*0.25 prefactor below these are the composite
  ! trapezoidal-rule weights on the 2-D grid.
  weight = 4.0_num
  weight(1 ,1 ) = 1.0_num
  weight(1 ,nz) = 1.0_num
  weight(nx,1 ) = 1.0_num
  weight(nx,nz) = 1.0_num
  weight(2:nx-1,1 ) = 2.0_num
  weight(2:nx-1,nz) = 2.0_num
  weight(1 ,2:nz-1) = 2.0_num
  weight(nx,2:nz-1) = 2.0_num
  seconds = omp_get_wtime()
  DO ix=1,nx
     DO iz=1,nz
        ! Separation between the target point (ix,iz) and every grid point
        rpx = x(ix)-x2D
        rpz = z(iz)-z2D
        magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
        integrand1 = Jy*rpz/(magrp**2.0_num)
        integrand2 =-Jy*rpx/(magrp**2.0_num)
        ! magrp(ix,iz) is zero (x2D(ix,iz) = x(ix), z2D(ix,iz) = z(iz)), so the
        ! self term is not finite; replace it with zero before summing
        integrand1(ix,iz) = 0.0_num
        integrand2(ix,iz) = 0.0_num
        bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
        bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
        ! integrand2 is reused for the logarithmic kernel that gives Ay_BS
        integrand2 =-Jy*log(magrp)
        integrand2(ix,iz) = 0.0_num
        Ay_BS(ix,iz) = dS*0.25*sum(weight*integrand2)/(2.0_num*pi)

     END DO
  END DO
  seconds = omp_get_wtime()-seconds
  write(*,*) '  Time for serial calc = ', seconds
  ! Spot check: one element near the middle of the grid
  print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
  print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
  ! Clear the results so the parallel pass below starts from zero
  bx_BS = 0.0_num
  bz_BS = 0.0_num
  Ay_BS = 0.0_num
! Parallel version of the integration above; the outer ix loop is split among threads.
!
! Data sharing:
!   shared  - arrays and scalars that are only read (x, z, x2D, z2D, jy, weight,
!             dS) and the result arrays bx_BS, bz_BS, Ay_BS, where each (ix,iz)
!             element is written by exactly one iteration.
!   private - the loop indices and the work arrays rpx, rpz, magrp, integrand1,
!             integrand2. Every iteration overwrites these as whole arrays, so
!             each thread needs its own copy; sharing them lets threads
!             overwrite each other's intermediate values, which gives wrong
!             results that change from run to run.
! default(none) forces every variable used in the region to be classified.
!
! The timer is started outside the region so that a single thread sets it and
! the measured time includes the cost of starting the threads.
  seconds = omp_get_wtime()
!$omp parallel default(none) &
!$omp shared(x, z, x2D, z2D, jy, weight, dS, bx_BS, bz_BS, Ay_BS, nthreads) &
!$omp private(ix, iz, tid, rpx, rpz, magrp, integrand1, integrand2)
  ! Thread id, kept for debugging output
  tid = omp_get_thread_num()
  ! nthreads is shared, so only one thread stores it
!$omp single
  nthreads = omp_get_num_threads()
!$omp end single nowait
!$omp do
  do ix = 1, nx
     do iz = 1, nz
        ! Separation between the target point (ix,iz) and every grid point
        rpx = x(ix)-x2D
        rpz = z(iz)-z2D
        magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
        integrand1 = jy*rpz/(magrp**2.0_num)
        integrand2 =-jy*rpx/(magrp**2.0_num)
        ! The self term has magrp = 0 and is not finite; drop it from the sum
        integrand1(ix,iz) = 0.0_num
        integrand2(ix,iz) = 0.0_num
        bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
        bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
        ! integrand2 is reused for the logarithmic kernel that gives Ay_BS
        integrand2 =-jy*log(magrp)
        integrand2(ix,iz) = 0.0_num
        Ay_BS(ix,iz) = dS*0.25_num*sum(weight*integrand2)/(2.0_num*pi)
     end do
  end do
!$omp end do
!$omp end parallel
  seconds = omp_get_wtime()-seconds
  write(*,*) '  Time for parallel calc = ', seconds
  ! Spot check: must match the values printed after the serial pass
  print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
  print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))

!!$
!!$  print*, '----------------------------------------------------------'
!!$  print*, 'checking curlA_BS vs B_BS'
!!$
!!$  print*, '----------------------------------------------------------'
!!$  print*, 'checking del2Ay vs curlB_BS'
!!$
!!$  print*, '----------------------------------------------------------'
!!$  print*, 'checking A_BS vs A'
!!$
!!$  print*, '----------------------------------------------------------'
!!$  print*, 'checking curlA_BS vs curlA'
!!$
!!$  print*, '----------------------------------------------------------'
!!$  print*, 'checking curlcurlA_BS vs curlcurlA'
!!$  
  ! Compare the integrated field with the prescribed one: max error, summed
  ! error and relative root-sum-square error for each component
  write(*,*) '----------------------------------------------------------'
  write(*,*) 'checking B_BS vs B'
  write(*,*) 'maxval(abs(Bx_BS-Bx)) ', maxval(abs(Bx_BS-Bx))
  write(*,*) 'total(abs(Bx_BS-Bx)) ', sum(abs(Bx_BS-Bx))
  write(*,*) 'sqrt(total(|Bx_BS-Bx|^2)/total(|Bx|^2) ', &
       sqrt(sum((Bx_BS-Bx)**2)/sum(Bx**2))
  write(*,*) 'maxval(abs(Bz_BS-Bz)) ', maxval(abs(Bz_BS-Bz))
  write(*,*) 'total(abs(Bz_BS-Bz)) ', sum(abs(Bz_BS-Bz))
  write(*,*) 'sqrt(total(|Bz_BS-Bz|^2)/total(|Bz|^2) ', &
       sqrt(sum((Bz_BS-Bz)**2)/sum(Bz**2))

  ! Curl of the difference between the integrated and the prescribed field
  write(*,*) '----------------------------------------------------------'
  write(*,*) 'checking curl(B-BS)'
  temp1 = 0.0_num
  temp2 = 0.0_num
  call pdiv_2D(bx_BS-bx, temp1, nx, 2, dz)
  call pdiv_2D(bz_BS-bz, temp2, nz, 1, dx)
  curl_diff_B_BS = temp1 - temp2
  write(*,*) 'maxval(abs(curl_diff_By_BS))', maxval(abs(curl_diff_B_BS))
  write(*,*) 'total(abs(curl_diff_By_BS))', sum(abs(curl_diff_B_BS))

  ! Curl of the integrated field against the curl of the prescribed field
  write(*,*) '----------------------------------------------------------'
  write(*,*) 'checking curlB_BS vs curlB'
  temp1 = 0.0_num
  temp2 = 0.0_num
  call pdiv_2D(bx_BS, temp1, nx, 2, dz)
  call pdiv_2D(bz_BS, temp2, nz, 1, dx)
  curlBy_BS = temp1 - temp2
  write(*,*) 'maxval(abs(curlBy_BS-curlBy)) ', maxval(abs(curlBy_BS-curlBy))
  write(*,*) 'total(abs(curlBy_BS-curlBy)) ', sum(abs(curlBy_BS-curlBy))
  write(*,*) 'sqrt(total(|curlBy_BS-curlBy|^2)/total(|curlBy|^2) ', &
       sqrt(sum((curlBy_BS-curlBy)**2)/sum(curlBy**2))
 
  ! Dump the axes and fields to an unformatted (binary) file, one record per
  ! array, in the order of the write statements below.
  out_unit = 3
  open(out_unit, file="output_internal_2D.dat", status="replace", form="unformatted")
  write(out_unit) x
  write(out_unit) z
  write(out_unit) jy
  write(out_unit) bx
  write(out_unit) bz
  write(out_unit) curlBy
!!$  write(out_unit) Ay
!!$  write(out_unit) curlAx
!!$  write(out_unit) curlAz
!!$  write(out_unit) del2Ay
!!$  write(out_unit) Ay_BS
!!$  write(out_unit) curlAx_BS
!!$  write(out_unit) curlAz_BS
  write(out_unit) Bx_BS
  write(out_unit) Bz_BS
  write(out_unit) curlBy_BS
  write(out_unit) curl_diff_B_BS

  ! Close through the same variable that was opened, so that changing the unit
  ! number above cannot leave the file open
  close(out_unit)
  print*, '----------------------------------------------------------'

CONTAINS



  ! Partial derivative of a square n-by-n array along one index direction.
  !
  !   array_in  - input field, shape (n,n)
  !   array_out - derivative of array_in, shape (n,n)
  !   n         - extent of both dimensions (callers pass nx or nz, which must
  !               therefore be equal)
  !   dir       - 1: differentiate along the first index,
  !               2: differentiate along the second index
  !   res       - grid spacing along the chosen direction
  !
  ! Each column (dir = 1) or row (dir = 2) is differentiated independently by
  ! deriv. Any other value of dir stops the program instead of returning with
  ! array_out undefined.
  SUBROUTINE pdiv_2D(array_in, array_out, n, dir, res)

    INTEGER, INTENT(IN) :: dir, n
    REAL(num), INTENT(IN) :: array_in(n,n)
    REAL(num), INTENT(OUT) :: array_out(n,n)
    REAL(num), INTENT(IN) :: res
    INTEGER :: k

    SELECT CASE (dir)
    CASE (1)
       DO k=1,n
          CALL deriv(array_in(:,k), array_out(:,k), n, res)
       END DO
    CASE (2)
       DO k=1,n
          CALL deriv(array_in(k,:), array_out(k,:), n, res)
       END DO
    CASE DEFAULT
       PRINT*, 'pdiv_2D: dir must be 1 or 2, got ', dir
       STOP 1
    END SELECT

  END SUBROUTINE pdiv_2D



  
  ! First derivative of a uniformly spaced 1-D array using seven-point stencils.
  !
  !   a_in  - samples, length n
  !   a_out - derivative estimate at every sample, length n
  !   n     - number of samples; must be at least 7
  !   res   - spacing between neighbouring samples
  !
  ! The centred formula is applied to the whole array with cshift, which wraps
  ! around at the ends. The three entries at each end are therefore overwritten
  ! afterwards with one-sided or off-centre seven-point formulas that only use
  ! points inside the array.
  SUBROUTINE deriv(a_in, a_out, n, res) ! one dimensional derivative - seven point stencil - careful at ends!
    
    INTEGER, INTENT(IN) :: n
    REAL(num), INTENT(IN) :: a_in(n)
    REAL(num), INTENT(OUT) :: a_out(n)
    REAL(num), INTENT(IN) :: res
    INTEGER :: ix

    ! The end-point formulas reach up to six points into the array
    IF (n < 7) THEN
       PRINT*, 'deriv: need at least 7 points for the 7 point stencil, got n = ', n
       STOP 1
    END IF

    ! Alternative kept for reference:
    !3 point stencil - fix end points to be one directional
    !a_out = (cshift(a_in,1)-cshift(a_in,-1))/(2.0_num*res)    
    !a_out(1) = (1.0_num/res) * (-3.0_num*a_in(1) + 4.0_num*a_in(2  ) - a_in(3  )) / 2.0_num
    !a_out(n) = (1.0_num/res) * ( 3.0_num*a_in(n) - 4.0_num*a_in(n-1) + a_in(n-2)) / 2.0_num

    !7 point stencil - fix end points to be one directional
    ! Centred formula; cshift(a_in,k)(i) is a_in(i+k) with wrap-around
    a_out = -(-cshift(a_in,3)+9.0_num*cshift(a_in,2)-45.0_num*cshift(a_in,1)+45.0_num*&
         cshift(a_in,-1)-9.0_num*cshift(a_in,-2)+cshift(a_in,-3))/(60.0_num*res)
    ! Left end: points 1, 2 and 3 use only a_in(1:7)
    ix=1
    a_out(ix) =  (1.0_num/res) * (-147.0_num*a_in(ix+0)+360.0_num*a_in(ix+1)-450.0_num*&
         a_in(ix+2)+400.0_num*a_in(ix+3)-225.0_num*a_in(ix+4)+72.0_num*a_in(ix+5)-10.0_num*a_in(ix+6))/(60.0_num)
    ix = 2
    a_out(ix) =  (1.0_num/res) * (-10.0_num*a_in(ix-1)-77.0_num*a_in(ix+0)+150.0_num*&
         a_in(ix+1)-100.0_num*a_in(ix+2)+50.0_num*a_in(ix+3)-15.0_num*a_in(ix+4)+2.0_num*a_in(ix+5))/(60.0_num)
    ix = 3
    a_out(ix) =  (1.0_num/res) * (2.0_num*a_in(ix-2)-24.0_num*a_in(ix-1)-35.0_num*&
         a_in(ix+0)+80.0_num*a_in(ix+1)-30.0_num*a_in(ix+2)+8.0_num*a_in(ix+3)-1.0_num*a_in(ix+4))/(60.0_num)
    ! Right end: points n-2, n-1 and n use only a_in(n-6:n)
    ix = n-2
    a_out(ix) =  (1.0_num/res) * (1.0_num*a_in(ix-4)-8.0_num*a_in(ix-3)+30.0_num*&
         a_in(ix-2)-80.0_num*a_in(ix-1)+35.0_num*a_in(ix+0)+24.0_num*a_in(ix+1)-2.0_num*a_in(ix+2))/(60.0_num)
    ix=n-1
    a_out(ix) =  (1.0_num/res) * (-2.0_num*a_in(ix-5)+15.0_num*a_in(ix-4)-50.0_num*&
         a_in(ix-3)+100.0_num*a_in(ix-2)-150.0_num*a_in(ix-1)+77.0_num*a_in(ix+0)+10.0_num*a_in(ix+1))/(60.0_num)
    ix=n
    a_out(ix)=   (1.0_num/res) * (10.0_num*a_in(ix-6)-72.0_num*a_in(ix-5)+225.0_num*&
         a_in(ix-4)-400.0_num*a_in(ix-3)+450.0_num*a_in(ix-2)-360.0_num*a_in(ix-1)+147.0_num*a_in(ix+0))/(60.0_num)

    ! NOTE(review): the original file also carried these formulas as commented-out
    ! code in a 0-based array language (apparently IDL), with the same coefficients
    ! and no division by the grid spacing.
  END SUBROUTINE deriv


END PROGRAM internal_2D

0 个答案:

没有答案
相关问题