My R on the Mac is linked against OpenBLAS. When I perform sparse-sparse multiplication with Armadillo in R or Rcpp and watch the "%CPU" usage, it does not appear to use multiple threads the way dense multiplication does. In terms of speed, single-threaded sparse-sparse multiplication in R or Armadillo also seems slower than Matlab's.
To address this, I implemented F. G. Gustavson's algorithm (https://dl.acm.org/citation.cfm?id=355796) for sparse-sparse multiplication in Rcpp, using Armadillo's sp_mat container.
If I skip the reordering of the row indices (which gives a direct implementation of the algorithm), I see an improvement (see below), but the standard reordering step makes it slower than R's own %*% (edited following mtall's comment). I am not an expert in Rcpp/RcppArmadillo/C++, and I am looking for help with two things:
1. How can I make the single-threaded sp_sp_gc_ord function more efficient and faster?
2. My clumsy attempt to multi-thread sp_sp_gc_ord with OpenMP crashes R; I have commented out the omp pragma in the code below. I have gone through the Rcpp Gallery posts on OpenMP (http://gallery.rcpp.org/tags/openmp/) but could not find the problem. A rough sketch of the direction I have been considering is included after the code.
Any help would be much appreciated. Below is a reproducible example: the Rcpp functions, followed by the corresponding microbenchmark section in R.
#### Rcpp functions
#include <RcppArmadillo.h>
#ifdef _OPENMP
#include <omp.h>
#endif
// Rcpp.h does not need to be included separately; RcppArmadillo.h already pulls it in
using namespace Rcpp;
using namespace arma;
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
sp_mat sp_sp_gc_ord(const arma::sp_mat &A, const arma::sp_mat &B, double p){
// This function evaluates A * B where both A & B are sparse and the resultant
// product is also sparse
// define matrix sizes
const int mA= A.n_rows;
const int nB= B.n_cols;
// upper bound on the number of non-zeros in the result, based on the density guess p
const int nnzC = ceil(mA * nB * p);
// initialize colptr, row_index and value vectors for the resultant sparse matrix
urowvec colptrC(nB+1);
colptrC.zeros();
uvec rowvalC(nnzC);
rowvalC.zeros();
colvec nzvalC(nnzC);
//setenv("OMP_STACKSIZE","500M",1);
// counters and other variables
unsigned int i, jp, j, kp, k, vp;
unsigned int ip = 0;
double nzB, nzA;
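// xb and x form the sparse accumulator: xb(k) == i marks that row k already
// has an entry in the column of C currently being built, and x(k) holds its
// accumulated value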
ivec xb(mA);
xb.fill(-1);
vec x(mA);
// loop logic: for each column i of B (= column i of C), loop over the non-zeros B(j,i)
// and accumulate nzB * (column j of A) into the sparse accumulator x/xb
// #pragma omp parallel for shared(colptrC,rowvalC,nzvalC,x,xb,ip,A,B) private(j,nzA,nzB,kp,i,jp,kp,k,vp) default(none) schedule(auto)
for(i=0; i< nB; i++) {
colptrC.at(i) = ip;
for ( jp = B.col_ptrs[i]; jp < B.col_ptrs[i+1]; jp++) {
j = B.row_indices[jp];
nzB = B.values[jp];
for ( kp = A.col_ptrs[j]; kp < A.col_ptrs[j+1]; kp++ ){
k = A.row_indices[kp];
nzA = A.values[kp];
if (xb.at(k) != i){
rowvalC.at(ip) = k;
ip +=1;
// Rcpp::print(wrap(ip));
xb.at(k) = i;
x.at(k) = nzA * nzB;
} else {
x.at(k) += nzA * nzB;
}
}
}
// put in the value vector of resultant matrix
if(ip>0){
for ( vp= colptrC.at(i); vp <= (ip-1); vp++ ) {
nzvalC.at(vp) = x(rowvalC.at(vp));
}
}
}
// resize and put in the spMat container
colptrC.at(nB) = ip;
sp_mat C(rowvalC.subvec(0,(ip-1)),colptrC,nzvalC.subvec(0,(ip-1)),mA,nB);
// Gustavson's algorithm leaves the row indices within each column unordered; a standard fix is the double transpose (C.t()).t()
return (C.t()).t();
}
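// ---------------------------------------------------------------------------
// Sketch (not called anywhere in this file): an alternative to the (C.t()).t()
// reordering step above. Since Gustavson's algorithm only leaves the row
// indices *within each column* unsorted, one could sort each column's
// (row index, value) pairs in place before handing the buffers to the sp_mat
// batch constructor, instead of transposing twice. The helper below is purely
// illustrative; its name and interface are my own and it is not wired into
// sp_sp_gc_ord. colptr is expected to hold the ncol+1 CSC column pointers.
#include <algorithm>
#include <utility>
#include <vector>
inline void sort_columns_inplace(arma::uvec &rowval, arma::colvec &nzval,
                                 const arma::urowvec &colptr,
                                 const unsigned int ncol) {
  std::vector< std::pair<arma::uword, double> > buf;
  for (unsigned int c = 0; c < ncol; c++) {
    const arma::uword start = colptr(c), stop = colptr(c + 1);
    buf.clear();
    for (arma::uword t = start; t < stop; t++)
      buf.emplace_back(rowval(t), nzval(t));
    // sort this column's entries by row index (row indices are distinct within a column)
    std::sort(buf.begin(), buf.end());
    for (arma::uword t = start; t < stop; t++) {
      rowval(t) = buf[t - start].first;
      nzval(t)  = buf[t - start].second;
    }
  }
}
// ---------------------------------------------------------------------------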
// [[Rcpp::export]]
sp_mat sp_sp_arma(const sp_mat &A, const sp_mat &B){
return A * B;
}
// [[Rcpp::export]]
mat dense_dense_arma(const mat &A, const mat &B){
return A * B;
}
#### End
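Edit: since the commented pragma above shares ip, colptrC, rowvalC and nzvalC across all threads, I suspect the crash comes from those concurrent writes. One direction I have been considering, but have not integrated into sp_sp_gc_ord, is a two-pass (symbolic count, then numeric fill) structure, so that no shared counter is needed. The sketch below only shows the counting pass; the function name sp_sp_count_cols and its interface are mine and purely illustrative, and I am not sure this is the right way to go.
#### OpenMP counting-pass sketch
#include <RcppArmadillo.h>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::depends(RcppArmadillo)]]
// Count the non-zeros of each column of C = A * B. Every column of C can be
// counted independently, so the loop parallelises without a shared counter:
// each thread keeps its own "last column seen" marker vector.
// [[Rcpp::export]]
arma::uvec sp_sp_count_cols(const arma::sp_mat &A, const arma::sp_mat &B) {
  const arma::uword mA = A.n_rows;
  const arma::uword nB = B.n_cols;
  arma::uvec counts(nB, arma::fill::zeros);
#ifdef _OPENMP
#pragma omp parallel
#endif
  {
    // thread-private marker: mark[k] == i means row k was already counted for column i
    std::vector<long long> mark(mA, -1);
#ifdef _OPENMP
#pragma omp for schedule(dynamic)
#endif
    for (long long i = 0; i < (long long) nB; i++) {
      arma::uword cnt = 0;
      for (arma::uword jp = B.col_ptrs[i]; jp < B.col_ptrs[i + 1]; jp++) {
        const arma::uword j = B.row_indices[jp];
        for (arma::uword kp = A.col_ptrs[j]; kp < A.col_ptrs[j + 1]; kp++) {
          const arma::uword k = A.row_indices[kp];
          if (mark[k] != i) { mark[k] = i; cnt++; }
        }
      }
      counts((arma::uword) i) = cnt;  // each thread writes a distinct element
    }
  }
  return counts;
}
#### End
If this counting pass is sound, colptrC could be obtained from a cumulative sum of the counts and the numeric fill could then be parallelised column by column as well, but I have not got that far yet.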
sessionInfo():
#### Microbenchmark
library(Matrix)
library(microbenchmark)
## define two matrices
m<- 1000
n<- 6000
p<- 2000
A<- matrix(runif(m*n),m,n)
B<- matrix(runif(n*p),n,p)
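## zero out all entries larger than .01 in absolute value, leaving roughly 1% non-zeros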
A[abs(A)> .01] = B[abs(B)> .01] = 0
A <- as(A,'dgCMatrix')
B<- as(B,'dgCMatrix')
Adense<- as.matrix(A)
Bdense<- as.matrix(B)
## sp_sp_gc is the same as sp_sp_gc_ord but without the final reordering step
microbenchmark(sp_sp_gc(A,B,.5), sp_sp_gc_ord(A,B,.5), sp_sp_arma(A,B), A%*%B,
               dense_dense_arma(Adense,Bdense), Adense %*% Bdense, Adense %*% B, times=100)
Unit: milliseconds
                             expr       min        lq      mean    median        uq       max neval
              sp_sp_gc(A, B, 0.5)  16.09809  21.75001  25.76436  24.44657  26.96300  99.30778   100
         sp_sp_gc_ord(A, B, 0.5)   36.78781  44.64558  49.82102  47.64348  51.87361 116.85013   100
                 sp_sp_arma(A, B)  47.45203  52.77132  59.37077  59.24010  62.41710  86.15647   100
                          A %*% B  23.64307  28.99649  32.88566  32.10017  35.21816  59.16251   100
 dense_dense_arma(Adense, Bdense) 286.22358 302.95170 345.66766 317.75786 340.50143 862.15116   100
                Adense %*% Bdense 292.32099 317.10795 342.48345 329.80950 342.21333 697.56468   100
                     Adense %*% B 167.87248 186.63499 219.11872 195.19197 212.50286 843.17172   100
####
Note: this was after installing clang4 for the Mac and then installing Rcpp and RcppArmadillo from source.