Question

我正在尝试在第三维上分解4D阵列，并使用MPI发送到每个节点。基本上，我正在计算三个笛卡儿方向中每一个中原子位置的矩阵Cpq的导数。 Cpq的大小为nat_sl x nat_sl，因此dCpqdR的大小为nat_sl x nat_sl x nat x 3.在一天结束时，对于s，i，我必须计算dCpqdR在特征向量转置之间的矩阵乘积Cpq和Cpq的特征向量如下：

temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdR(:, :, s, i), Cpq))

这很好，但事实证明，s和i的循环现在是我代码的缓慢部分。因为每个都可以独立完成，我希望我可以分解dCpqdR，并给每个任务它自己的s，我计算它的衍生物。也就是说，我希望任务1获得dCpqdR（：，：1,1），任务2获得dCpqdR（：，：，1,2）等。

我在某种意义上通过使用缓冲的send / recv调用来实现这一点。根节点分配临时数组，填充它，发送到相关节点，相关节点按照自己的意愿进行计算。这很好，但速度慢且内存效率低。理想情况下，我希望以更有效的方式分解它。

然而，合乎逻辑的做法是使用mpi_scatterv，但这是我开始遇到麻烦的地方，因为我无法弄清楚内存布局。到目前为止，我写过这篇文章：

    call mpi_type_create_subarray(4, (/ nat_sl, nat_sl, nat, 3 /), (/nat_sl, nat_sl, n_pairs(me_image+1), 3/),&
                                  (/0, 0, 0, 0/), mpi_order_fortran, mpi_double_precision, subarr_typ, ierr)
    call mpi_type_commit(subarr_typ, ierr)

    call mpi_scatterv(dCpqdR, n_pairs(me_image+1), f_displs, subarr_typ,&
                      my_dCpqdR, 3*nat_sl*3*nat_sl*3*n_pairs(me_image+1), subarr_typ,&
                      root_image, intra_image_comm, ierr)

我使用这个子程序计算了n_pairs：

subroutine mbdvdw_para_init_int_forces()
implicit none
integer :: p, s, i, counter, k, cpu_ind
integer :: num_unique_rpq, n_pairs_per_proc, cpu
real(dp) :: Rpq(3), Rpq_norm, current_val

num_pairs = nat
if(.not.allocated(f_cpu_id))        allocate(f_cpu_id(nat, 3))
n_pairs_per_proc = floor(dble(num_pairs)/nproc_image)
cpu = 0
n_pairs = 0

counter = 1
p = 1
do counter = 0, num_pairs-1, 1
    n_pairs(modulo(counter, nproc_image)+1) = n_pairs(modulo(counter, nproc_image)+1) + 1
    end do

do s = 1, nat, 1
    f_cpu_id(s) = cpu
    if((counter.lt.num_pairs)) then
        if(p.eq.n_pairs(cpu+1)) then
            cpu = cpu + 1
            p = 0
            end if
        end if
    p = p + 1
    end do

call mp_set_displs( n_pairs, f_displs, num_pairs, nproc_image)
f_displs = f_displs*nat_sl*nat_sl*3

end subroutine mbdvdw_para_init_int_forces

和矩阵乘法的完整方法是

subroutine mbdvdw_interacting_energy(energy, forcedR, forcedh, forcedV)
    implicit none
    real(dp), intent(out) :: energy
    real(dp), dimension(nat, 3), intent(out) :: forcedR
    real(dp), dimension(3,3), intent(out) :: forcedh
    real(dp), dimension(nat), intent(out) :: forcedV
    real(dp), dimension(3*nat_sl, 3*nat_sl) :: temp
    real(dp), dimension(:,:,:,:), allocatable :: my_dCpqdR
    integer :: num_negative, i_atom, s, i, j, counter
    integer, parameter :: eigs_check    = 200
    integer :: subarr_typ, ierr

    ! lapack work variables
    integer :: LWORK, errorflag
    real(dp) :: WORK((3*nat_sl)*(3+(3*nat_sl)/2)), eigenvalues(3*nat_sl)

    call start_clock('mbd_int_energy')
    call mp_sum(Cpq, intra_image_comm)
    eigenvalues = 0.0_DP
    forcedR = 0.0_DP
    energy = 0.0_DP
    num_negative = 0
    forcedV = 0.0_DP

    errorflag=0
    LWORK=3*nat_sl*(3+(3*nat_sl)/2)
    call DSYEV('V', 'U', 3*nat_sl, Cpq, 3*nat_sl, eigenvalues, WORK, LWORK, errorflag)

    if(errorflag.eq.0) then
        do i_atom=1, 3*nat_sl, 1
            !open (unit=eigs_check, file="eigs.tmp",action="write",status="unknown",position="append")
            !    write(eigs_check, *) eigenvalues(i_atom)
            !close(eigs_check)
            if(eigenvalues(i_atom).ge.0.0_DP) then
                energy = energy + dsqrt(eigenvalues(i_atom))
            else
                num_negative = num_negative + 1
            end if
        end do

        if(num_negative.ge.1) then
            write(stdout, '(3X," WARNING: Found ", I3, " Negative Eigenvalues.")'), num_negative
        end if
        else
    end if

    energy = energy*nat/nat_sl

    !!!!!!!!!!!!!!!!!!!!
    ! Forces below here. There's going to be some long parallelization business.
    !!!!!!!!!!!!!!!!!!!!

    call start_clock('mbd_int_forces')

    if(.not.allocated(my_dCpqdR)) allocate(my_dCpqdR(nat_sl, nat_sl, n_pairs(me_image+1), 3)), my_dCpqdR = 0.0_DP



    if(mbd_vdw_forces) then
        do s=1,nat,1
            if(me_image.eq.(f_cpu_id(s)+1)) then
                do i=1,3,1
                    temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(my_dCpqdR(:, :, counter, i), Cpq))
                    do j=1,3*nat_sl,1
                        if(eigenvalues(j).ge.0.0_DP) then
                            forcedR(s, i) = forcedR(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
                            end if
                        end do
                    end do
                    counter = counter + 1
                end if
            end do
        forcedR = forcedR*nat/nat_sl

        do s=1,3,1
            do i=1,3,1
                    temp  = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdh(:, :, s, i), Cpq))
                    do j=1,3*nat_sl,1
                        if(eigenvalues(j).ge.0.0_DP) then
                            forcedh(s, i) = forcedh(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
                            end if
                        end do
                end do
            end do
        forcedh = forcedh*nat/nat_sl
        call mp_sum(forcedR, intra_image_comm)
        call mp_sum(forcedh, intra_image_comm)
    end if
    call stop_clock('mbd_int_forces')

    call stop_clock('mbd_int_energy')
    return
end subroutine mbdvdw_interacting_energy

但是当跑步时，它抱怨

[MathBook Pro:58100] *** An error occurred in MPI_Type_create_subarray
[MathBook Pro:58100] *** reported by process [2560884737,2314885530279477248]
[MathBook Pro:58100] *** on communicator MPI_COMM_WORLD
[MathBook Pro:58100] *** MPI_ERR_ARG: invalid argument of some other kind
[MathBook Pro:58100] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[MathBook Pro:58100] ***    and potentially your MPI job)

所以出了问题，但我不知道是什么。我知道我的描述开始时有些稀疏，所以请告诉我需要哪些信息来帮助。

使用mpi_scatterv和4D fortran数组

0 个答案: