I have a large sparse matrix X in scipy.sparse.csr_matrix format, and I want to multiply it by a numpy array W, making use of parallelism. After some research I found that I need to use Array in multiprocessing to avoid copying X and W between processes (see e.g. How to combine Pool.map with Array (shared memory) in Python multiprocessing? and Is shared readonly data copied to different processes for Python multiprocessing?). Here is my latest attempt:
import multiprocessing
import numpy
import scipy.sparse
import time
def initProcess(data, indices, indptr, shape, Warr, Wshp):
    # Stash the shared buffers in module-level globals so dot2 can see them
    global XData
    global XIndices
    global XIndptr
    global Xshape

    XData = data
    XIndices = indices
    XIndptr = indptr
    Xshape = shape

    global WArray
    global WShape

    WArray = Warr
    WShape = Wshp

def dot2(args):
    rowInds, i = args

    # Wrap the shared buffers without copying and rebuild the CSR matrix
    data = numpy.frombuffer(XData, dtype=numpy.float64)
    indices = numpy.frombuffer(XIndices, dtype=numpy.int32)
    indptr = numpy.frombuffer(XIndptr, dtype=numpy.int32)
    Xr = scipy.sparse.csr_matrix((data, indices, indptr), shape=Xshape)

    W = numpy.frombuffer(WArray, dtype=numpy.float64).reshape(WShape)
    return Xr[rowInds[i]:rowInds[i+1], :].dot(W)

def getMatmat(X):
    numJobs = multiprocessing.cpu_count()
    # Boundaries of the row blocks, one block per job
    rowInds = numpy.array(numpy.linspace(0, X.shape[0], numJobs+1), dtype=int)

    # Store the data in X as RawArray objects so we can share it among processes
    XData = multiprocessing.RawArray("d", X.data)
    XIndices = multiprocessing.RawArray("i", X.indices)
    XIndptr = multiprocessing.RawArray("i", X.indptr)

    def matmat(W):
        WArray = multiprocessing.RawArray("d", W.flatten())
        pool = multiprocessing.Pool(processes=numJobs, initializer=initProcess,
                                    initargs=(XData, XIndices, XIndptr, X.shape, WArray, W.shape))

        params = [(rowInds, i) for i in range(numJobs)]
        results = pool.map(dot2, params)

        # Reassemble the row blocks into the full product
        P = numpy.zeros((X.shape[0], W.shape[1]))
        for i in range(numJobs):
            P[rowInds[i]:rowInds[i+1], :] = results[i]

        return P

    return matmat
if __name__ == '__main__':
    # Create a random sparse matrix X and a random dense one W
    X = scipy.sparse.rand(10000, 8000, 0.1)
    X = X.tocsr()
    W = numpy.random.rand(8000, 20)

    startTime = time.time()
    A = getMatmat(X)(W)
    parallelTime = time.time() - startTime

    startTime = time.time()
    B = X.dot(W)
    nonParallelTime = time.time() - startTime
    print(parallelTime, nonParallelTime)
However, the output is something like (4.431, 0.165), indicating that the parallel version is much slower than the non-parallel multiplication.
I would expect a slowdown like this if the large data were being copied into the processes, but that shouldn't be the case here, since I use Array to store the shared variables (unless it happens in numpy.frombuffer or when the csr_matrix is created, but I could not find a way to share a csr_matrix directly). Another possible cause of the slowness is that each process returns a large result for each matrix multiplication, but I am not sure of a way around that.
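One thing I could try for the return cost is to allocate the output as a shared RawArray too, and have each worker write its row block in place so that pool.map only sends back None values. A rough, untested sketch of that idea (initOut, dot3, and PArray are hypothetical names, variants of initProcess and dot2 above):

def initOut(data, indices, indptr, shape, Warr, Wshp, Parr):
    # Same as initProcess, plus a shared buffer for the result
    global XData, XIndices, XIndptr, Xshape, WArray, WShape, PArray
    XData, XIndices, XIndptr, Xshape = data, indices, indptr, shape
    WArray, WShape, PArray = Warr, Wshp, Parr

def dot3(args):
    rowInds, i = args
    data = numpy.frombuffer(XData, dtype=numpy.float64)
    indices = numpy.frombuffer(XIndices, dtype=numpy.int32)
    indptr = numpy.frombuffer(XIndptr, dtype=numpy.int32)
    Xr = scipy.sparse.csr_matrix((data, indices, indptr), shape=Xshape)
    W = numpy.frombuffer(WArray, dtype=numpy.float64).reshape(WShape)
    P = numpy.frombuffer(PArray, dtype=numpy.float64).reshape((Xshape[0], WShape[1]))
    # Write the block in place; nothing large gets pickled back to the parent
    P[rowInds[i]:rowInds[i+1], :] = Xr[rowInds[i]:rowInds[i+1], :].dot(W)

# In matmat: create PArray = multiprocessing.RawArray("d", X.shape[0] * W.shape[1]),
# pass it through initargs, call pool.map(dot3, params), then read the product
# back with numpy.frombuffer(PArray, dtype=numpy.float64).reshape(X.shape[0], W.shape[1])

This would avoid pickling the result blocks, although the frombuffer/csr_matrix reconstruction would still happen in every task.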
Can anyone see where I am going wrong? Thanks for any help!
Update: I can't be sure, but I think sharing large amounts of data between processes is just not very efficient, and ideally I should be using multithreading (although the Global Interpreter Lock (GIL) makes that very hard). One way around this is to release the GIL using Cython, for example (see http://docs.cython.org/src/userguide/parallelism.html), although a lot of the numpy functions need to go through the GIL.
Answer 0 (score: 1)
Your best bet is to drop down to C with Cython. That way you can beat the GIL and use OpenMP. I'm not surprised that multiprocessing is slower; there's a lot of overhead there.
Here is a basic OpenMP wrapper of CSparse's sparse matrix - vector product code in Python.
On my laptop, this runs a little bit faster than scipy, but I don't have that many cores. The code, including the setup.py script, the C header files and so on, is in this gist: https://gist.github.com/rmcgibbo/6019670
I suspect that if you really want the parallel code to be fast (on my laptop, it's only about 20% faster than single-threaded scipy, even when using 4 threads), you need to think more carefully than I did about where the parallelism happens, paying attention to cache locality.
# psparse.pyx
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
cimport cython
cimport numpy as np
import numpy as np
import scipy.sparse
from libc.stddef cimport ptrdiff_t
from cython.parallel import parallel, prange
#-----------------------------------------------------------------------------
# Headers
#-----------------------------------------------------------------------------
ctypedef int csi

ctypedef struct cs:
    # matrix in compressed-column or triplet form
    csi nzmax     # maximum number of entries
    csi m         # number of rows
    csi n         # number of columns
    csi *p        # column pointers (size n+1) or col indices (size nzmax)
    csi *i        # row indices, size nzmax
    double *x     # numerical values, size nzmax
    csi nz        # # of entries in triplet matrix, -1 for compressed-col

cdef extern csi cs_gaxpy (cs *A, double *x, double *y) nogil
cdef extern csi cs_print (cs *A, csi brief) nogil

assert sizeof(csi) == 4
#-----------------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------------
@cython.boundscheck(False)
def pmultiply(X not None, np.ndarray[ndim=2, mode='fortran', dtype=np.float64_t] W not None):
    """Multiply a sparse CSC matrix by a dense matrix

    Parameters
    ----------
    X : scipy.sparse.csc_matrix
        A sparse matrix, of size N x M
    W : np.ndarray[dtype=float64, ndim=2, mode='fortran']
        A dense matrix, of size M x P. Note, W must be contiguous and in
        fortran (column-major) order. You can ensure this using
        numpy's `asfortranarray` function.

    Returns
    -------
    A : np.ndarray[dtype=float64, ndim=2, mode='fortran']
        A dense matrix, of size N x P, the result of multiplying X by W.

    Notes
    -----
    This function is parallelized over the columns of W using OpenMP. You
    can control the number of threads at runtime using the OMP_NUM_THREADS
    environment variable. The internal sparse matrix code is from CSPARSE,
    a Concise Sparse matrix package. Copyright (c) 2006, Timothy A. Davis.
    http://www.cise.ufl.edu/research/sparse/CSparse, licensed under the
    GNU LGPL v2.1+.

    References
    ----------
    .. [1] Davis, Timothy A., "Direct Methods for Sparse Linear Systems
       (Fundamentals of Algorithms 2)," SIAM Press, 2006. ISBN: 0898716136
    """
    if X.shape[1] != W.shape[0]:
        raise ValueError('matrices are not aligned')

    cdef int i
    cdef cs csX
    cdef np.ndarray[double, ndim=2, mode='fortran'] result
    cdef np.ndarray[csi, ndim=1, mode='c'] indptr = X.indptr
    cdef np.ndarray[csi, ndim=1, mode='c'] indices = X.indices
    cdef np.ndarray[double, ndim=1, mode='c'] data = X.data

    # Pack the scipy data into the CSparse struct. This is just copying some
    # pointers.
    csX.nzmax = X.data.shape[0]
    csX.m = X.shape[0]
    csX.n = X.shape[1]
    csX.p = &indptr[0]
    csX.i = &indices[0]
    csX.x = &data[0]
    csX.nz = -1  # to indicate CSC format

    result = np.zeros((X.shape[0], W.shape[1]), order='F', dtype=np.double)
    for i in prange(W.shape[1], nogil=True):
        # W and result are fortran-ordered, so each of their columns is
        # contiguous, and the cs_gaxpy calls are independent across columns
        cs_gaxpy(&csX, &W[0, i], &result[0, i])
    return result
It calls some C from CSparse:
// src/cs_gaxpy.c
#include "cs.h"
/* y = A*x+y */
csi cs_gaxpy (const cs *A, const double *x, double *y)
{
    csi p, j, n, *Ap, *Ai ;
    double *Ax ;
    if (!CS_CSC (A) || !x || !y) return (0) ;    /* check inputs */
    n = A->n ; Ap = A->p ; Ai = A->i ; Ax = A->x ;
    for (j = 0 ; j < n ; j++)
    {
        for (p = Ap [j] ; p < Ap [j+1] ; p++)
        {
            y [Ai [p]] += Ax [p] * x [j] ;
        }
    }
    return (1) ;
}
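For completeness, here is roughly how you would call it, assuming you've built the extension from the gist as a module named psparse (the name is taken from the psparse.pyx filename above; I haven't re-checked the setup.py):

import numpy as np
import scipy.sparse
import psparse  # the compiled Cython extension from the gist

X = scipy.sparse.rand(10000, 8000, 0.1).tocsc()   # pmultiply expects CSC
W = np.asfortranarray(np.random.rand(8000, 20))   # and a fortran-ordered W
A = psparse.pmultiply(X, W)
print(np.allclose(A, X.dot(W)))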
Answer 1 (score: 1)
This reply may be a bit late. It is possible to get reliable parallel speedups by using the pyTrilinos package, which provides Python wrappers to many functions in Trilinos. Here is your example converted to use pyTrilinos:
from PyTrilinos import Epetra
from scipy.sparse import rand
import numpy as np
n_rows = 10000
n_cols = 8000
n_vecs = 20
fill_factor = 0.1
comm = Epetra.PyComm()
my_id = comm.MyPID()
row_map = Epetra.Map(n_rows, 0, comm)
out_vec_map = row_map
in_vec_map = Epetra.Map(n_cols, 0, comm)
col_map = Epetra.Map(n_cols, range(n_cols), 0, comm)
n_local_rows = row_map.NumMyElements()
# Create local block matrix in scipy and convert to Epetra
X = rand(n_local_rows, n_cols, fill_factor).tocoo()
A = Epetra.CrsMatrix(Epetra.Copy, row_map, col_map, int(fill_factor*n_cols*1.2), True)
A.InsertMyValues(X.row, X.col, X.data)
A.FillComplete()
# Create sub-vectors in numpy and convert to Epetra format
W = np.random.rand(in_vec_map.NumMyElements(), n_vecs)
V = Epetra.MultiVector(in_vec_map, n_vecs)
V[:] = W.T # order of indices is opposite
B = Epetra.MultiVector(out_vec_map, n_vecs)
# Multiply
A.Multiply(False, V, B)
You can then run this code using MPI:

mpiexec -n 2 python scipy_to_trilinos.py
Other examples of PyTrilinos can be found on the github repository here. Of course, if one were to use pyTrilinos, this way of initializing the matrix by using scipy may not be the most optimal.
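As a quick sanity check when running on a single process (where the local block is the whole matrix), the result can be read back out of B and compared against scipy. This assumes Epetra.MultiVector can be viewed as a numpy array with the transposed index order, which is what the V[:] = W.T line above already relies on:

# Single-process check: convert the Epetra result back and compare with scipy
B_np = np.array(B).T                      # back to (n_rows, n_vecs) layout
print(np.allclose(B_np, X.tocsr().dot(W)))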