I'm working inside a tight loop in R that I need to optimize. It updates the weights inside an IRLS algorithm by computing the Schur product of a vector and a matrix: that is, it multiplies each element of the matrix by the corresponding row value of the vector, producing a result with the same dimensions as the matrix. In oversimplified schematic form it looks like this:
reweight = function(iter, w, Q) {
  for (i in 1:iter) {
    wT = w * Q
  }
}
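(A tiny illustration of those semantics, separate from the hot loop: because length(w) equals nrow(Q), R recycles w down each column, so row i of Q is scaled by w[i].)

w = c(2, 3)
Q = matrix(1:6, nrow = 2)  # 2 x 3
w * Q
#      [,1] [,2] [,3]
# [1,]    2    6   10
# [2,]    6   12   18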
In plain R code, each iteration creates (and then discards) a new matrix of dim() [rows, cols]:
cols = 1000
rows = 1000000
w = runif(rows)
Q = matrix(1.0, rows, cols)
Rprofmem()
reweight(5, w, Q)
Rprofmem(NULL)
nate@ubuntu:~/R$ less Rprofmem.out
8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"
8000000040 :"reweight"
If the matrix is large (multiple GB), the cost of the memory allocation exceeds the time spent on the numeric operation itself (each 8,000,000,040-byte entry above is one 1,000,000 x 1,000 matrix of 8-byte doubles, plus header):
nate@ubuntu:~/R$ perf record -p `pgrep R` sleep 5 && perf report
49.93% R [kernel.kallsyms] [k] clear_page_c_e
47.67% R libR.so [.] real_binary
0.57% R [kernel.kallsyms] [k] get_page_from_freelist
0.35% R [kernel.kallsyms] [k] clear_huge_page
0.34% R libR.so [.] RunGenCollect
0.20% R [kernel.kallsyms] [k] clear_page
It also consumes a great deal of memory:
USER PID VSZ RSS COMMAND
nate 17099 22.5GB 22.5GB /usr/local/lib/R/bin/exec/R --vanilla
If the matrix is small (a few MB) but the iteration count is high, memory use is more reasonable, but at the cost of the garbage collector spending more time than the numeric computation:
cols = 100
rows = 10000
w = runif(rows)
Q = matrix(1.0, rows, cols)
reweight(1000, w, Q)
(note that this is a fresh process, started from scratch)
61.51% R libR.so [.] RunGenCollect
26.40% R libR.so [.] real_binary
7.94% R libR.so [.] SortNodes
2.79% R [kernel.kallsyms] [k] clear_page_c_e
USER PID VSZ RSS COMMAND
nate 17099 191MB 72MB /usr/local/lib/R/bin/exec/R --vanilla
If I write my own function with Rcpp to do the work, I can get the memory allocation I want:
library(Rcpp)
cppFunction('
void weightMatrix(NumericVector w,
                  NumericMatrix Q,
                  NumericMatrix wQ) {
    size_t numRows = Q.rows();
    for (size_t row = 0; row < numRows; row++) {
        wQ(row,_) = w(row) * Q(row,_);
    }
    return;
}
')
reweightCPP = function(iter, w, Q) {
  # Initialize workspace to non-NA
  wQ = matrix(1.0, nrow(Q), ncol(Q))
  for (i in 1:iter) {
    weightMatrix(w, Q, wQ)
  }
}
cols = 100
rows = 10000
w = runif(rows)
Q = matrix(1.0, rows, cols)
wQ = matrix(NA, rows, cols)
Rprofmem()
reweightCPP(5, w, Q)
Rprofmem(NULL)
nate@ubuntu:~/R$ less Rprofmem.out
8000040 :"matrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
2544 :"<Anonymous>" "weightMatrix" "reweightCPP"
(What is the 2544-byte allocation? It seems to be an Rcpp constant. Is there any way to avoid it?)
Thanks to the Rcpp sugar, performance is still not ideal (the Q(row,_) row proxies appear to be behind the Rf_getAttrib calls below):
76.53% R sourceCpp_82335.so [.] _Z12weightMatrixN4Rcpp6VectorILi14ENS_15PreserveStorageEEENS_6MatrixILi14ES1_EES4_
10.46% R libR.so [.] Rf_getAttrib
9.53% R libR.so [.] getAttrib0
2.06% R libR.so [.] Rf_isMatrix
0.42% R libR.so [.] INTEGER
But I can work around that by dropping down to lower-level C++:
cppFunction('
void weightMatrix(NumericVector w_,
                  NumericMatrix Q_,
                  NumericMatrix wQ_) {
    size_t numCols = Q_.ncol();
    size_t numRows = Q_.nrow();
    double * __restrict__ w = &w_[0];
    double * __restrict__ Q = &Q_[0];
    double * __restrict__ wQ = &wQ_[0];
    for (size_t row = 0; row < numRows; row++) {
        size_t colOffset = 0;
        for (size_t col = 0; col < numCols; col++) {
            wQ[colOffset + row] = w[row] * Q[colOffset + row];
            colOffset += numRows;
        }
    }
    return;
}
')
99.18% R sourceCpp_59392.so [.] sourceCpp_48203_weightMatrix
0.06% R libR.so [.] PutRNGstate
0.06% R libR.so [.] do_begin
0.06% R libR.so [.] Rf_eval
That said, I still have not figured out how to get the compiler to reliably generate efficient assembly without resorting to SIMD intrinsics to force the use of VMULPD. Even the ugly '__restrict__' qualifiers, in the form shown here, seem to have forced me to invert my loop order and do a lot of unnecessary work. But presumably I will eventually discover the magic cross-compiler syntax, or, more likely, end up calling a Fortran BLAS routine.
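One variant I have not tried seriously yet keeps the loops in R's native column-major order (column outer, row inner), on the assumption that a unit-stride inner loop is easier for the vectorizer; this is only an unbenchmarked sketch along the lines of the function above (the name weightMatrixColMajor is a placeholder):

library(Rcpp)
cppFunction('
void weightMatrixColMajor(NumericVector w_,
                          NumericMatrix Q_,
                          NumericMatrix wQ_) {
    size_t numCols = Q_.ncol();
    size_t numRows = Q_.nrow();
    const double * __restrict__ w  = &w_[0];
    const double * __restrict__ Q  = &Q_[0];
    double * __restrict__ wQ = &wQ_[0];
    // R matrices are column-major, so walking down each column gives
    // unit-stride access to Q and wQ that the compiler can vectorize.
    for (size_t col = 0; col < numCols; col++) {
        size_t offset = col * numRows;
        for (size_t row = 0; row < numRows; row++) {
            wQ[offset + row] = w[row] * Q[offset + row];
        }
    }
}
')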
Which brings me to my question:
Is there any way to get the performance I want without going through all this trouble? Failing that, is there a way to at least hide it behind the scenes, so that an end user in R can write "wQ = w * Q" and have it magically reuse wQ instead of allocating and discarding yet another huge matrix?
For the case where the answer can be written into one of the operands (Q = w * Q), the BLAS wrappers in R seem to do quite well, but I haven't found any way to do this when I need a "third-party" workspace. Is there some reasonable way to define methods for %=% that would translate "wQ = w * Q" into "op_mult(w, Q, wQ)"?
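To make that concrete, here is a rough, untested sketch of the kind of interface I am imagining; the %=% dispatch is a placeholder, it only recognizes a literal w * Q call, and it leans on the in-place weightMatrix() defined above:

`%=%` = function(target, expr) {
  e = substitute(expr)  # capture the unevaluated right-hand side
  while (is.call(e) && identical(e[[1]], as.name("("))) e = e[[2]]  # drop parens
  if (is.call(e) && identical(e[[1]], as.name("*"))) {
    w = eval(e[[2]], parent.frame())
    Q = eval(e[[3]], parent.frame())
    weightMatrix(w, Q, target)  # in-place C++ update; target must be preallocated
    return(invisible(target))
  }
  stop("unsupported expression")
}
# usage (parentheses required because %...% binds tighter than *):
# wQ %=% (w * Q)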
To preempt the question of whether it is worth solving: yes, I have measured, and it matters. The use case is an ensemble of cross-validated logistic regressions inside the loop of a procedure for large longitudinal data (http://cran.r-project.org/web/packages/ltmle/ltmle.pdf). Each analysis will call this millions, if not billions, of times. A good optimization of this function would help take the run time from "impossible" to "days". A great optimization (or rather a combination of several such optimizations) might get it down to "hours" or even "minutes".
EDIT: In the comments, Henrik correctly points out that the example loop has been simplified to the point of just repeating the same computation over and over. I hoped that would focus the discussion, but perhaps it is confusing instead. In the real version there are more steps in the loop, so that the 'w' in 'w * Q' is different on every iteration. Below is a poorly tested draft version of the actual function. It is a "semi-optimized" logistic regression in straight R, based on O'Leary's QR Newton IRLS as described by Bryan Lewis.
logistic_irls_qrnewton = function(A, y, maxIter=25, targetSSE=1e-16) {
  # warn user below on first weight less than threshold
  tinyWeightsFound = FALSE
  tiny = sqrt(.Machine$double.eps)
  # decompose A to QR (only once, Choleski done in loop)
  QR = qr(A)    # A[rows=samples, cols=covariates]
  Q = qr.Q(QR)  # Q[rows, cols] (same dimensions as A)
  R = qr.R(QR)  # R[cols, cols] (upper right triangular)
  # copying now prevents copying each time y is used as argument
  y = y + 0  # y[rows]
  # first pass is outside loop since initial values are constant
  iter = 1
  t = (y - 0.5) * 4.0  # t[rows] = (y - m) * initial weight
  C = chol(crossprod(Q, Q))  # C[cols, cols]
  t = crossprod(Q, t)
  s = forwardsolve(t(C), t)  # s[cols]
  s = backsolve(C, s)
  t = Q %*% s
  sse = crossprod(s)  # sum of squared errors
  print(as.vector(sse))
  converged = ifelse(sse < targetSSE, 1, 0)
  while (converged == 0 && iter < maxIter) {
    iter = iter + 1
    # only t is required as an input
    dim(t) = NULL  # matrix to vector to counteract crossprod
    e = exp(t)
    m = e / (e + 1)  # mu = exp(eta) / (1 + exp(eta))
    d = m / (e + 1)  # mu.eta = exp(eta) / (1 + exp(eta))^2
    w = d * d / (m - m^2)  # W = mu.eta^2 / variance (= mu * (1 - mu) for the logit link)
    if (tinyWeightsFound == FALSE && min(w) < tiny) {
      print("Tiny weights found")
      tinyWeightsFound = TRUE
    }
    t = crossprod(Q, w * (((y - m) / d) + t))
    C = chol(crossprod(Q, w * Q))
    n = forwardsolve(t(C), t)
    n = backsolve(C, n)
    t = Q %*% n
    sse = crossprod(n - s)  # divergence from previous
    s = n  # save current solution to compare with the next iteration
    print(as.vector(sse))
    if (sse < targetSSE) converged = iter
  }
  if (converged == 0) {
    print(paste("Failed to converge after", iter, "iterations"))
    print(paste("Final SSE was", sse))
  } else {
    print(paste("Convergence after iteration", iter))
  }
  coefficients = backsolve(R, crossprod(Q, t))
  dim(coefficients) = NULL  # return as a vector
  coefficients
}
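For reference, a quick simulated-data smoke test of the draft (the data, seed, and tolerance here are arbitrary; the coefficients should come out close to those from glm.fit with a binomial family):

set.seed(1)
n = 1000
A = cbind(1, matrix(rnorm(n * 3), n, 3))  # intercept + 3 covariates
beta = c(-0.5, 1, -1, 0.5)
y = rbinom(n, 1, plogis(A %*% beta))
fit1 = logistic_irls_qrnewton(A, y, targetSSE = 1e-12)
fit2 = glm.fit(A, y, family = binomial())$coefficients
print(cbind(qrnewton = fit1, glm = unname(fit2)))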