我有一个学校项目在hpc分布式系统上进行矩阵乘法。
我需要从并行IO系统读取矩阵并使用pblacs在许多计算节点(处理器)上并行执行矩阵乘法。必须使用MPI IO命令读取数据。我知道PBlacs使用块循环分布来执行乘法。
教授没有向我们提供有关MPI IO的更多信息,而且我无法在其上找到大量信息/资源。 具体来说,有没有办法以块循环方式从并行io系统中读取矩阵并轻松将其插入到pblacs pdgemm中?
非常感谢任何有用资源的指示。我的时间有点短暂,对这个项目缺乏方向感到沮丧。
答案 0 :(得分:13)
这实际上相对简单(如果您已经了解了一些关于blacs / scalapack和mpi-io的内容!)但即便如此,文档 - 甚至是在线 - 就像您发现的那样,有点差。
首先要了解的是MPI-IO,它允许您使用普通的MPI数据类型来指定文件的每个进程“视图”,然后只读取属于该视图的数据。在我们的中心,我们有关于并行IO的半天课程的幻灯片和源代码;前三分之一是关于MPI-IO的基础知识。幻灯片here和示例源代码here。
要知道的第二件事是MPI有一种内置的方法来创建“分布式阵列”数据类型,其中一种组合允许您布局块循环数据分布;我在回答这个问题时的一般性讨论:What is the difference between darray and subarray in mpi?。
这意味着如果您有一个包含大矩阵的二进制文件,您可以使用MPI_Type_create_darray
使用mpi-io读取它,并且它将以块循环方式由任务分发。那么这只是做blacs或scalapack调用的问题。列出了使用scalapack psgemv进行矩阵向量乘法而不是psgemm的示例程序
在我对计算科学堆栈交换中question的回答中。
为了让您了解这些部分如何组合在一起,以下是一个简单的程序,它读入包含矩阵的二进制文件(首先是方形矩阵N的大小,然后是N ^ 2个元素),然后计算使用scalapack(新)pssyevr
例程的特征值和向量。它结合了MPI-IO,darray和scalapack的东西。它在Fortran中,但函数调用在基于C语言中是相同的。
!
! Use MPI-IO to read a diagonal matrix distributed block-cyclically,
! use Scalapack to calculate its eigenvalues, and compare
! to expected results.
!
program darray
use mpi
implicit none
integer :: n, nb ! problem size and block size
integer :: myArows, myAcols ! size of local subset of global array
real :: p
real, dimension(:), allocatable :: myA, myZ
real, dimension(:), allocatable :: work
integer, dimension(:), allocatable :: iwork
real, dimension(:), allocatable :: eigenvals
real, dimension(:), allocatable :: expected
integer :: worksize, totwork, iworksize
integer, external :: numroc ! blacs routine
integer :: me, procs, icontxt, prow, pcol, myrow, mycol ! blacs data
integer :: info ! scalapack return value
integer, dimension(9) :: ides_a, ides_z ! scalapack array desc
integer :: clock
real :: calctime, iotime
character(len=128) :: filename
integer :: mpirank
integer :: ierr
integer, dimension(2) :: pdims, dims, distribs, dargs
integer :: infile
integer, dimension(MPI_STATUS_SIZE) :: mpistatus
integer :: darray
integer :: locsize, nelements
integer(kind=MPI_ADDRESS_KIND) :: lb, locextent
integer(kind=MPI_OFFSET_KIND) :: disp
integer :: nargs
integer :: m, nz
! Initialize MPI (for MPI-IO)
call MPI_Init(ierr)
call MPI_Comm_size(MPI_COMM_WORLD,procs,ierr)
call MPI_Comm_rank(MPI_COMM_WORLD,mpirank,ierr)
! May as well get the process grid from MPI_Dims_create
pdims = 0
call MPI_Dims_create(procs, 2, pdims, ierr)
prow = pdims(1)
pcol = pdims(2)
! get filename
nargs = command_argument_count()
if (nargs /= 1) then
print *,'Usage: darray filename'
print *,' Where filename = name of binary matrix file.'
call MPI_Abort(MPI_COMM_WORLD,1,ierr)
endif
call get_command_argument(1, filename)
! find the size of the array we'll be using
call tick(clock)
call MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr)
call MPI_File_read_all(infile,n,1,MPI_INTEGER,mpistatus,ierr)
call MPI_File_close(infile,ierr)
! create the darray that will read in the data.
nb = 64
if (nb > (N/prow)) nb = N/prow
if (nb > (N/pcol)) nb = N/pcol
dims = [n,n]
distribs = [MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC]
dargs = [nb, nb]
call MPI_Type_create_darray(procs, mpirank, 2, dims, distribs, dargs, &
pdims, MPI_ORDER_FORTRAN, MPI_REAL, darray, ierr)
call MPI_Type_commit(darray,ierr)
call MPI_Type_size(darray, locsize, ierr)
nelements = locsize/4
call MPI_Type_get_extent(darray, lb, locextent, ierr)
! Initialize local arrays
allocate(myA(nelements))
allocate(myZ(nelements))
allocate(eigenvals(n), expected(n))
! read in the data
call MPI_File_open(MPI_COMM_WORLD, trim(filename), MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr)
disp = 4 ! skip N = 4 bytes
call MPI_File_set_view(infile, disp, MPI_REAL, darray, "native", MPI_INFO_NULL, ierr)
call MPI_File_read_all(infile, myA, nelements, MPI_REAL, mpistatus, ierr)
call MPI_File_close(infile,ierr)
iotime = tock(clock)
if (mpirank == 0) then
print *,'I/O time = ', iotime
endif
! Initialize blacs processor grid
call tick(clock)
call blacs_pinfo (me,procs)
call blacs_get (-1, 0, icontxt)
call blacs_gridinit(icontxt, 'R', prow, pcol)
call blacs_gridinfo(icontxt, prow, pcol, myrow, mycol)
myArows = numroc(n, nb, myrow, 0, prow)
myAcols = numroc(n, nb, mycol, 0, pcol)
! Construct local arrays
! Global structure: matrix A of n rows and n columns
! Prepare array descriptors for ScaLAPACK
call descinit( ides_a, n, n, nb, nb, 0, 0, icontxt, myArows, info )
call descinit( ides_z, n, n, nb, nb, 0, 0, icontxt, myArows, info )
! Call ScaLAPACK library routine
allocate(work(1), iwork(1))
iwork(1) = -1
work(1) = -1.
call pssyevr( 'V', 'A', 'U', n, myA, 1, 1, ides_a, -1.e20, +1.e20, 1, n, &
m, nz, eigenvals, myZ, 1, 1, ides_z, work, -1, &
iwork, -1, info )
worksize = int(work(1))/2*3
iworksize = iwork(1)/2*3
print *, 'Local workspace ', worksize
call MPI_Reduce(worksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
if (mpirank == 0) print *, ' total work space ', totwork
call MPI_Reduce(iworksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
if (mpirank == 0) print *, ' total iwork space ', totwork
deallocate(work,iwork)
allocate(work(worksize),iwork(iworksize))
call pssyev('N','U',n,myA,1,1,ides_a,eigenvals,myZ,1,1,ides_z,work,worksize,info)
if (info /= 0) then
print *, 'Error: info = ', info
else if (mpirank == 0) then
print *, 'Calculated ', m, ' eigenvalues and ', nz, ' eigenvectors.'
endif
! Deallocate the local arrays
deallocate(myA, myZ)
deallocate(work, iwork)
! End blacs for processors that are used
call blacs_gridexit(icontxt)
calctime = tock(clock)
! calculated the expected eigenvalues for a particular test matrix
p = 3. + sqrt((4. * n - 3.) * (n - 1.)*3./(n+1.))
expected(1) = p/(n*(5.-2.*n))
expected(2) = 6./(p*(n + 1.))
expected(3:n) = 1.
! Print results
if (me == 0) then
if (info /= 0) then
print *, 'Error -- info = ', info
endif
print *,'Eigenvalues L_infty err = ', &
maxval(abs(eigenvals-expected))
print *,'Compute time = ', calctime
endif
deallocate(eigenvals, expected)
call MPI_Finalize(ierr)
contains
subroutine tick(t)
integer, intent(OUT) :: t
call system_clock(t)
end subroutine tick
! returns time in seconds from now to time described by t
real function tock(t)
integer, intent(in) :: t
integer :: now, clock_rate
call system_clock(now,clock_rate)
tock = real(now - t)/real(clock_rate)
end function tock
end program darray