Using mpi_scatterv with a 4D Fortran array

Asked: 2015-04-04 13:55:26

Tags: fortran mpi openmpi

I am trying to decompose a 4D array along its third dimension and send each piece to a node with MPI. Basically, I'm computing the derivatives of a matrix, Cpq, with respect to the atom positions in each of the three Cartesian directions. Cpq has size nat_sl x nat_sl, so dCpqdR has size nat_sl x nat_sl x nat x 3. At the end of the day, for each pair s, i I have to compute the matrix product of dCpqdR sandwiched between the transpose of the eigenvectors of Cpq and the eigenvectors of Cpq, like so:

temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdR(:, :, s, i), Cpq))

This works fine, but it turns out the loop over s and i is now the slow part of my code. Because each iteration can be done independently, I was hoping I could break dCpqdR up and give each task its own s, i for which to compute the derivative. That is, I'd like task 1 to get dCpqdR(:,:,1,1), task 2 to get dCpqdR(:,:,1,2), and so on.
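
For reference, the loop I am trying to parallelize looks roughly like this (a trimmed-down sketch of the force accumulation from the full routine shown further down; nothing here is new beyond that routine):

    do s = 1, nat, 1
        do i = 1, 3, 1
            ! sandwich the derivative slice between the eigenvector matrices
            temp = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdR(:, :, s, i), Cpq))
            ! accumulate the diagonal of temp into the force on atom s, direction i
            do j = 1, 3*nat_sl, 1
                if(eigenvalues(j).ge.0.0_DP) then
                    forcedR(s, i) = forcedR(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
                end if
            end do
        end do
    end do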

I have managed this, in a sense, with buffered send/recv calls: the root node allocates a temporary array, fills it, and sends it to the relevant node, which then carries out its computation as it pleases. This works, but it is slow and memory-inefficient. Ideally I would like to break the array up in a more efficient way.
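
What I have at the moment is essentially the following (a simplified sketch: tmp_buf, first_s, and last_s are illustrative names rather than my actual variables, tag and error handling are stripped out, and the root's own slice is copied locally and not shown):

    ! local helpers for this sketch; dCpqdR, my_dCpqdR, n_pairs, nat_sl, me_image,
    ! root_image, nproc_image and intra_image_comm are the same as in the code below
    real(dp), allocatable :: tmp_buf(:,:,:,:)
    integer, allocatable :: first_s(:), last_s(:)   ! hypothetical: first/last s owned by each rank
    integer :: cpu, ierr

    if(me_image.eq.root_image) then
        do cpu = 1, nproc_image-1, 1
            ! root packs the slices destined for rank cpu into a contiguous buffer
            allocate(tmp_buf(nat_sl, nat_sl, n_pairs(cpu+1), 3))
            tmp_buf = dCpqdR(:, :, first_s(cpu+1):last_s(cpu+1), :)
            call mpi_send(tmp_buf, nat_sl*nat_sl*n_pairs(cpu+1)*3, mpi_double_precision,&
                          cpu, 0, intra_image_comm, ierr)
            deallocate(tmp_buf)
        end do
    else
        ! every other rank receives only its own chunk into my_dCpqdR
        call mpi_recv(my_dCpqdR, nat_sl*nat_sl*n_pairs(me_image+1)*3, mpi_double_precision,&
                      root_image, 0, intra_image_comm, mpi_status_ignore, ierr)
    end if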

The logical thing to do, then, is to use mpi_scatterv, but this is where I start running into trouble, because I can't figure out the memory layout. So far I have written this:

    call mpi_type_create_subarray(4, (/ nat_sl, nat_sl, nat, 3 /), (/nat_sl, nat_sl, n_pairs(me_image+1), 3/),&
                                  (/0, 0, 0, 0/), mpi_order_fortran, mpi_double_precision, subarr_typ, ierr)
    call mpi_type_commit(subarr_typ, ierr)

    call mpi_scatterv(dCpqdR, n_pairs(me_image+1), f_displs, subarr_typ,&
                      my_dCpqdR, 3*nat_sl*3*nat_sl*3*n_pairs(me_image+1), subarr_typ,&
                      root_image, intra_image_comm, ierr)

I computed n_pairs with this subroutine:

subroutine mbdvdw_para_init_int_forces()
implicit none
integer :: p, s, i, counter, k, cpu_ind
integer :: num_unique_rpq, n_pairs_per_proc, cpu
real(dp) :: Rpq(3), Rpq_norm, current_val

num_pairs = nat
if(.not.allocated(f_cpu_id))        allocate(f_cpu_id(nat, 3))
n_pairs_per_proc = floor(dble(num_pairs)/nproc_image)
cpu = 0
n_pairs = 0

counter = 1
p = 1
do counter = 0, num_pairs-1, 1
    n_pairs(modulo(counter, nproc_image)+1) = n_pairs(modulo(counter, nproc_image)+1) + 1
    end do

do s = 1, nat, 1
    f_cpu_id(s) = cpu
    if((counter.lt.num_pairs)) then
        if(p.eq.n_pairs(cpu+1)) then
            cpu = cpu + 1
            p = 0
            end if
        end if
    p = p + 1
    end do

call mp_set_displs( n_pairs, f_displs, num_pairs, nproc_image)
f_displs = f_displs*nat_sl*nat_sl*3

end subroutine mbdvdw_para_init_int_forces

and the full routine for the matrix multiplication is:

subroutine mbdvdw_interacting_energy(energy, forcedR, forcedh, forcedV)
    implicit none
    real(dp), intent(out) :: energy
    real(dp), dimension(nat, 3), intent(out) :: forcedR
    real(dp), dimension(3,3), intent(out) :: forcedh
    real(dp), dimension(nat), intent(out) :: forcedV
    real(dp), dimension(3*nat_sl, 3*nat_sl) :: temp
    real(dp), dimension(:,:,:,:), allocatable :: my_dCpqdR
    integer :: num_negative, i_atom, s, i, j, counter
    integer, parameter :: eigs_check    = 200
    integer :: subarr_typ, ierr

    ! lapack work variables
    integer :: LWORK, errorflag
    real(dp) :: WORK((3*nat_sl)*(3+(3*nat_sl)/2)), eigenvalues(3*nat_sl)

    call start_clock('mbd_int_energy')
    call mp_sum(Cpq, intra_image_comm)
    eigenvalues = 0.0_DP
    forcedR = 0.0_DP
    energy = 0.0_DP
    num_negative = 0
    forcedV = 0.0_DP

    errorflag=0
    LWORK=3*nat_sl*(3+(3*nat_sl)/2)
    call DSYEV('V', 'U', 3*nat_sl, Cpq, 3*nat_sl, eigenvalues, WORK, LWORK, errorflag)

    if(errorflag.eq.0) then
        do i_atom=1, 3*nat_sl, 1
            !open (unit=eigs_check, file="eigs.tmp",action="write",status="unknown",position="append")
            !    write(eigs_check, *) eigenvalues(i_atom)
            !close(eigs_check)
            if(eigenvalues(i_atom).ge.0.0_DP) then
                energy = energy + dsqrt(eigenvalues(i_atom))
            else
                num_negative = num_negative + 1
            end if
        end do

        if(num_negative.ge.1) then
            write(stdout, '(3X," WARNING: Found ", I3, " Negative Eigenvalues.")') num_negative
        end if
        else
    end if

    energy = energy*nat/nat_sl

    !!!!!!!!!!!!!!!!!!!!
    ! Forces below here. There's going to be some long parallelization business.
    !!!!!!!!!!!!!!!!!!!!

    call start_clock('mbd_int_forces')

    if(.not.allocated(my_dCpqdR)) allocate(my_dCpqdR(nat_sl, nat_sl, n_pairs(me_image+1), 3)); my_dCpqdR = 0.0_DP



    if(mbd_vdw_forces) then
        do s=1,nat,1
            if(me_image.eq.(f_cpu_id(s)+1)) then
                do i=1,3,1
                    temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(my_dCpqdR(:, :, counter, i), Cpq))
                    do j=1,3*nat_sl,1
                        if(eigenvalues(j).ge.0.0_DP) then
                            forcedR(s, i) = forcedR(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
                            end if
                        end do
                    end do
                    counter = counter + 1
                end if
            end do
        forcedR = forcedR*nat/nat_sl

        do s=1,3,1
            do i=1,3,1
                    temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdh(:, :, s, i), Cpq))
                    do j=1,3*nat_sl,1
                        if(eigenvalues(j).ge.0.0_DP) then
                            forcedh(s, i) = forcedh(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
                            end if
                        end do
                end do
            end do
        forcedh = forcedh*nat/nat_sl
        call mp_sum(forcedR, intra_image_comm)
        call mp_sum(forcedh, intra_image_comm)
    end if
    call stop_clock('mbd_int_forces')

    call stop_clock('mbd_int_energy')
    return
end subroutine mbdvdw_interacting_energy

But when I run it, it complains:

[MathBook Pro:58100] *** An error occurred in MPI_Type_create_subarray
[MathBook Pro:58100] *** reported by process [2560884737,2314885530279477248]
[MathBook Pro:58100] *** on communicator MPI_COMM_WORLD
[MathBook Pro:58100] *** MPI_ERR_ARG: invalid argument of some other kind
[MathBook Pro:58100] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[MathBook Pro:58100] ***    and potentially your MPI job)

So something is going wrong, but I don't know what. I realize my description is a bit sparse to start with, so please let me know what information would help.

0 Answers:

No answers yet