在R中重用现有内存用于BLAS操作

时间:2015-02-15 23:29:26

标签: r rcpp blas armadillo

我正在优化 R 中一个紧密循环(tight loop)内的操作。它在 IRLS 算法内部通过计算向量和矩阵的 Schur 乘积(逐元素乘积)来更新权重。也就是说,它将矩阵中的每个元素乘以向量中对应行的值,产生与原矩阵尺寸相同的结果。用过度简化的示意形式表示,它看起来像这样:

# Schur-product reweighting demo: recomputes w * Q `iter` times.
#
# Args:
#   iter: number of iterations to repeat the product.
#   w:    numeric weight vector, one entry per row of Q.
#   Q:    numeric matrix; each row i is scaled by w[i] via R's
#         column-wise recycling of w.
#
# Returns NULL (invisibly); the point is the allocation pattern:
# each iteration allocates a brand-new result matrix, which is
# exactly the cost being profiled below.
reweight <- function(iter, w, Q) {
  # seq_len() is safe for iter = 0; the original 1:iter would
  # produce c(1, 0) and run the body twice.
  for (i in seq_len(iter)) {
    wT <- w * Q
  }
}

在普通的R代码中,每次迭代都会创建一个新的dim()[rows,cols]矩阵:

# Build a large test problem (~8 GB of doubles: 1e6 x 1e3 matrix of
# ones plus a 1e6-row weight vector), then record every allocation
# made while reweighting it five times.
rows <- 1000000
cols <- 1000
w <- runif(rows)
Q <- matrix(1.0, rows, cols)

Rprofmem()
reweight(5, w, Q)
Rprofmem(NULL)

nate@ubuntu:~/R$ less Rprofmem.out

8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"

如果矩阵很大(多GB),内存分配的成本超过了数字操作所花费的时间:

nate@ubuntu:~/R$ perf record -p `pgrep R` sleep 5 && perf report

49.93%  R  [kernel.kallsyms]  [k] clear_page_c_e
47.67%  R  libR.so            [.] real_binary
 0.57%  R  [kernel.kallsyms]  [k] get_page_from_freelist
 0.35%  R  [kernel.kallsyms]  [k] clear_huge_page
 0.34%  R  libR.so            [.] RunGenCollect
 0.20%  R  [kernel.kallsyms]  [k] clear_page

它也消耗了大量内存:

USER       PID VSZ    RSS    COMMAND
nate     17099 22.5GB 22.5GB /usr/local/lib/R/bin/exec/R --vanilla

如果矩阵较小(几MB)但迭代次数较多,则内存使用率更合理,但代价是垃圾收集器使用的时间比数值计算更多:

# Much smaller matrix (a few MB) but 200x the iterations: the
# allocation volume is modest, yet the garbage collector now eats
# more time than the arithmetic itself.
rows <- 10000
cols <- 100
w <- runif(rows)
Q <- matrix(1.0, rows, cols)
reweight(1000, w, Q)

(请注意,这是一个从头开始的新流程)

61.51%  R  libR.so            [.] RunGenCollect
26.40%  R  libR.so            [.] real_binary
 7.94%  R  libR.so            [.] SortNodes
 2.79%  R  [kernel.kallsyms]  [k] clear_page_c_e

USER       PID VSZ    RSS    COMMAND
nate     17099 191MB  72MB /usr/local/lib/R/bin/exec/R --vanilla

如果我用Rcpp编写自己的函数来完成工作,我可以得到我想要的内存分配:

library(Rcpp)
# Rcpp kernel: writes w[row] * Q[row, ] into the caller-supplied
# workspace matrix wQ, so no per-call result matrix is allocated.
# NOTE(review): wQ is mutated in place, which sidesteps R's usual
# copy-on-modify semantics — callers must not pass a matrix that is
# shared elsewhere. The C++ source is a string literal and is kept
# byte-identical here.
cppFunction('
void weightMatrix(NumericVector w,
                  NumericMatrix Q,
                  NumericMatrix wQ) {
    size_t numRows = Q.rows();
    for (size_t row = 0; row < numRows; row++) {
       wQ(row,_) = w(row) * Q(row,_);  
    }
    return;
}
')

# Driver for the Rcpp kernel: allocates the workspace matrix once,
# then reuses it every iteration (weightMatrix fills wQ in place).
#
# Args:
#   iter: number of reweighting passes.
#   w:    numeric weight vector (length nrow(Q)).
#   Q:    numeric matrix to be reweighted.
#
# Returns NULL (invisibly); only the allocation profile matters here.
reweightCPP <- function(iter, w, Q) {
  # Initialize workspace to non-NA
  wQ <- matrix(1.0, nrow(Q), ncol(Q))
  # seq_len() is safe for iter = 0; 1:iter would run the body twice.
  for (i in seq_len(iter)) {
    weightMatrix(w, Q, wQ)
  }
}

# Profile the Rcpp path on the small problem; only the one-time
# workspace allocation inside reweightCPP should dominate the
# Rprofmem output. (The top-level wQ here is not actually used by
# reweightCPP, which builds its own workspace.)
rows <- 10000
cols <- 100
w <- runif(rows)
Q <- matrix(1.0, rows, cols)
wQ <- matrix(NA, rows, cols)
Rprofmem()
reweightCPP(5, w, Q)
Rprofmem(NULL)

nate@ubuntu:~/R$ less Rprofmem.out

8000040 :"matrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"

(2544字节的分配是什么?似乎是一个Rcpp常数。有什么方法可以避免它吗?)

由于Rcpp糖,性能仍然不理想:

76.53%  R  sourceCpp_82335.so  [.] _Z12weightMatrixN4Rcpp6VectorILi14ENS_15PreserveStorageEEENS_6MatrixILi14ES1_EES4_
10.46%  R  libR.so             [.] Rf_getAttrib
 9.53%  R  libR.so             [.] getAttrib0
 2.06%  R  libR.so             [.] Rf_isMatrix
 0.42%  R  libR.so             [.] INTEGER

但我可以通过使用较低级别的C ++来解决这个问题:

# Lower-level variant: raw __restrict__ pointers into the three R
# objects avoid the per-element Rcpp-sugar overhead (the Rf_getAttrib
# / getAttrib0 calls visible in the earlier profile). The walk is
# column-major: colOffset advances by numRows per column, so
# wQ[colOffset + row] = w[row] * Q[colOffset + row] with no bounds
# checks. The C++ source is a string literal and is kept
# byte-identical here.
cppFunction('
void weightMatrix(NumericVector w_,
                  NumericMatrix Q_,
                  NumericMatrix wQ_) {
    size_t numCols = Q_.ncol();
    size_t numRows = Q_.nrow();
    double * __restrict__ w = &w_[0];
    double * __restrict__ Q = &Q_[0];
    double * __restrict__ wQ = &wQ_[0];
    for (size_t row = 0; row < numRows; row++) {
        size_t colOffset = 0;
        for (size_t col = 0; col < numCols; col++) {
            wQ[colOffset + row] = w[row] * Q[colOffset + row];
            colOffset += numRows;
        }
    }
    return;
}
')

99.18%  R  sourceCpp_59392.so  [.] sourceCpp_48203_weightMatrix
 0.06%  R  libR.so             [.] PutRNGstate
 0.06%  R  libR.so             [.] do_begin
 0.06%  R  libR.so             [.] Rf_eval

尽管如此,我还没有找到让编译器可靠地生成高效汇编的办法,除非动用 SIMD 内在函数来强制使用 VMULPD。即使加上丑陋的 `__restrict__` 属性,以这里所示的形式,我似乎也被迫反转循环顺序并做许多不必要的工作。但据推测,我最终会找到那种神奇的跨编译器语法,或者更可能的是,改为调用 Fortran BLAS 函数。

这让我想到了我的问题:

有没有什么方法可以获得我想要的性能,而不用经历所有这些麻烦?如果没有,我有什么办法至少把这些复杂性隐藏在幕后,让 R 中的最终用户可以写 `wQ = w * Q`,并让它神奇地重用 wQ,而不是分配并丢弃又一个巨大的矩阵?

对于可以把结果写回其中一个操作数的情况(Q = w * Q),R 的 BLAS 包装器似乎做得相当不错;但当我需要一个"第三方"工作区时,我还没有找到任何方法做到这一点。是否有某种合理的方式为 `%=%` 之类的运算符定义方法,把 `wQ = w * Q` 转换为 `op_mult(w, Q, wQ)`?

先回答"这是否值得优化"的问题:是的,我已经测量过,而且影响很大。用例是在循环内对大型纵向数据(http://cran.r-project.org/web/packages/ltmle/ltmle.pdf)做交叉验证逻辑回归的集成(ensemble)。这个函数将被调用数百万次(甚至数十亿次)。对它的一次良好优化可以把运行时间从"不可能"降到"几天";一次出色的优化(或者更准确地说,几个这样的优化的组合)则可能把它降到"几小时"甚至"几分钟"。

编辑:Henrik 在评论中正确地指出,示例循环被简化到只是反复执行同一计算。我本希望这样能让讨论聚焦,但也许反而造成了困惑。在真实版本中,循环里会有更多步骤,使得 `w * Q` 中的 `w` 在每次迭代都不同。下面是实际函数的一个测试不充分的草稿版本。它是基于 Bryan Lewis 所介绍的 O'Leary 的 QR Newton IRLS、用纯 R 写成的"半优化"逻辑回归。

# Logistic regression via IRLS with a one-time QR decomposition
# (O'Leary's QR Newton IRLS, as described by Bryan Lewis).
#
# Args:
#   A:         design matrix [rows = samples, cols = covariates].
#   y:         0/1 response vector of length nrow(A).
#   maxIter:   maximum number of IRLS iterations.
#   targetSSE: convergence threshold on the squared step size.
#
# Returns the coefficient vector (length ncol(A)).
# Prints the per-iteration SSE and a convergence message.
logistic_irls_qrnewton <- function(A, y, maxIter = 25, targetSSE = 1e-16) {
    # warn user below on first weight less than threshold
    tinyWeightsFound <- FALSE
    tiny <- sqrt(.Machine$double.eps)

    # decompose A to QR (only once, Choleski done in loop)
    QR <- qr(A)     # A[rows=samples, cols=covariates]
    Q  <- qr.Q(QR)  # Q[rows, cols] (same dimensions as A)
    R  <- qr.R(QR)  # R[cols, cols] (upper right triangular)

    # copying now prevents copying each time y is used as argument
    y <- y + 0      # y[rows]

    # first pass is outside loop since initial values are constant.
    # NOTE: `t` is reused as a data variable; calls t(C) still resolve
    # to base::t because R looks up functions separately.
    iter <- 1
    t <- (y - 0.5) * 4.0       # t[rows] = (y - m) * initial weight
    C <- chol(crossprod(Q, Q)) # C[cols, cols] (fixed: was mislabeled [rows, rows])
    t <- crossprod(Q, t)
    s <- forwardsolve(t(C), t) # s[cols]
    s <- backsolve(C, s)       # fixed: stray ')' removed (was a syntax error)
    t <- Q %*% s
    sse <- crossprod(s)        # sum of squared errors
    print(as.vector(sse))
    # scalar condition: plain if/else instead of vectorized ifelse()
    converged <- if (sse < targetSSE) 1 else 0

    while (converged == 0 && iter < maxIter) {
        iter <- iter + 1

        # only t is required as an input
        dim(t) <- NULL        # matrix to vector to counteract crossprod
        e <- exp(t)
        m <- e / (e + 1)       # mu = exp(eta) / (1 + exp(eta))
        d <- m / (e + 1)       # mu.eta = exp(eta) / (1 + exp(eta))^2
        w <- d * d / (m - m^2) # W = (1 / variance) = 1 / (mu * (1 - mu))
        if (!tinyWeightsFound && min(w) < tiny) {
            print("Tiny weights found")
            tinyWeightsFound <- TRUE
        }
        t <- crossprod(Q, w * (((y - m) / d) + t))
        C <- chol(crossprod(Q, w * Q))
        n <- forwardsolve(t(C), t)
        n <- backsolve(C, n)
        t <- Q %*% n
        sse <- crossprod(n - s) # divergence from previous
        s <- n                  # save for comparison against next step
        print(as.vector(sse))
        if (sse < targetSSE) converged <- iter
    }

    if (converged == 0) {
        print(paste("Failed to converge after", iter, "iterations"))
        print(paste("Final SSE was", sse))
    } else {
        print(paste("Convergence after iteration", iter))
    }

    coefficients <- backsolve(R, crossprod(Q, t))
    dim(coefficients) <- NULL # return as a vector
    coefficients
}

0 个答案:

没有答案