我正在测试RcppParallel软件包,用于计算磁盘上数据的内积(通过内存映射访问——与bigmemory软件包类似)。
“最小”可复现示例:
length(x) <- value
输出:
v <- 1:20
everyOther(v) <- 7
v
# [1] 1 7 3 7 5 7 7 7 9 7 11 7 13 7 15 7 17 7 19 7
使用一个线程需要4秒钟,而使用2个线程则需要53秒。 我对可能造成这种巨大差异的原因有些困惑。有什么想法吗?
PS1:我已经在两台不同的计算机上运行了此程序(没有其他进程在运行)。
PS2:我知道我应该并行化的是下面这段代码:
// [[Rcpp::depends(RcppParallel, BH, bigstatsr)]]
#include <bigstatsr/BMCodeAcc.h>
#include <RcppParallel.h>
using namespace RcppParallel;
// RcppParallel worker: accumulates the inner product of column `j` with
// column `j0` of a file-backed (memory-mapped) big matrix, for use with
// parallelReduce over a range of row indices.
struct Sum : public Worker {
  SubBMCode256Acc macc;  // accessor into the memory-mapped matrix
  double xySum;          // running dot-product accumulator
  std::size_t j0, j;     // the pair of column indices being multiplied

  // Initial worker: both columns default to 0.
  Sum(SubBMCode256Acc macc)
    : macc(macc), xySum(0), j0(0), j(0) {}

  // Worker targeting a specific pair of columns (j0, j), fresh accumulator.
  Sum(const Sum& sum, std::size_t j0, std::size_t j)
    : macc(sum.macc), xySum(0), j0(j0), j(j) {}

  // Split constructor required by parallelReduce: same columns, zeroed sum.
  Sum(const Sum& sum, Split)
    : macc(sum.macc), xySum(0), j0(sum.j0), j(sum.j) {}

  // Accumulate the element-wise products over rows [begin, end).
  void operator()(std::size_t begin, std::size_t end) {
    for (std::size_t row = begin; row != end; ++row) {
      const double prod = macc(row, j) * macc(row, j0);
      xySum += prod;
    }
  }

  // Fold in the partial sum computed by another (split-off) worker.
  void join(const Sum& rhs) {
    xySum += rhs.xySum;
  }
};
// [[Rcpp::export]]
NumericVector parallelVectorSum(Environment BM) {

  // Attach the file-backed big matrix and build an accessor over all rows
  // and columns (indices are 0-based on the C++ side, hence the `- 1`).
  XPtr<FBM> xpBM = BM["address"];
  std::size_t n = xpBM->nrow();
  std::size_t m = xpBM->ncol();
  SubBMCode256Acc macc(xpBM, seq_len(n) - 1, seq_len(m) - 1, BM["code256"]);

  // Chunk size for parallelReduce: sqrt(n) trades scheduling overhead
  // against load balancing. Cast explicitly instead of the original
  // implicit double -> int narrowing, and guard against a zero grain
  // (n == 0), which parallelReduce does not accept.
  std::size_t grain = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  if (grain == 0) grain = 1;

  Sum sum0(macc);
  NumericVector res(m);

  // For each column j, reduce over the rows the product of column j with
  // column 0 (row iterations are independent, so they can be split).
  for (std::size_t j = 0; j < m; j++) {
    Sum sum(sum0, 0, j);
    parallelReduce(0, n, sum, grain);
    res[j] = sum.xySum;
  }

  return res;
}
/*** R
# Use two worker threads for the small correctness check.
RcppParallel::setThreadOptions(2)
library(bigsnpr)
# Small example genotype matrix shipped with bigsnpr.
snp <- snp_attachExtdata()
G <- snp$genotypes
# Reference result on the small matrix.
test0 <- parallelVectorSum(G)
# Blow the matrix up 500x by repeating the rows, so each column sum
# is exactly 500 * the small-matrix result.
G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))
dim(G2)
# Time the single-threaded run and check correctness.
RcppParallel::setThreadOptions(1)
system.time(test1 <- parallelVectorSum(G2))
testthat::expect_identical(test1, 500 * test0)
# Same computation with two threads; result must be identical.
RcppParallel::setThreadOptions(2)
system.time(test2 <- parallelVectorSum(G2))
testthat::expect_identical(test2, 500 * test0)
*/
。我已经测试过了,它运作良好。但是,在我遇到的实际问题中,在
> Rcpp::sourceCpp('tmp-tests/test-rcpp-parallel.cpp')
> RcppParallel::setThreadOptions(2)
> library(bigsnpr)
> snp <- snp_attachExtdata()
> G <- snp$genotypes
> test0 <- parallelVectorSum(G)
> G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))
> dim(G2)
[1] 258500 4542
> RcppParallel::setThreadOptions(1)
> system.time(test1 <- parallelVectorSum(G2)) # 100 / 3
user system elapsed
3.621 0.423 4.045
> testthat::expect_identical(test1, 500 * test0)
> RcppParallel::setThreadOptions(2)
> system.time(test2 <- parallelVectorSum(G2)) # 177 / 39
user system elapsed
39.958 42.590 53.516
> testthat::expect_identical(test2, 500 * test0)
j 上的迭代不是独立的,因此在 i(行)上并行化要容易得多。