我需要将一个大矩阵转换为 libsvm 可用的特定格式。该格式要求每一行以标签(1 或 -1)开头,后跟 0:行号,然后依次是 1:该行第1列的值、2:该行第2列的值,依此类推。
下面给出的简单方法太慢了,
require(microbenchmark)
# Benchmark fixture: an nR x nC matrix of uniform random values in [0, 1]
# and one random class label (1 or -1) per row.
nR <- 100
nC <- 500
kMat <- matrix(runif(nR * nC), nrow = nR)
# TRUE is a reserved word; T is an ordinary variable that can be reassigned.
yLab <- sample(c(1, -1), nR, replace = TRUE)
# Simple method.
#
# Formats the global matrix `kMat` and label vector `yLab` as one text line
# per matrix row:
#   "<label> 0:<row> 1:<value> 2:<value> ..."
# Values are converted with R's default numeric-to-character conversion.
#
# Returns a character vector with one element per row of `kMat`
# (character(0) when `kMat` has no rows).
met1 <- function() {
  col_ids <- seq_len(ncol(kMat))
  # vapply over seq_len() avoids growing the result with c() on every
  # iteration and handles a zero-row matrix, where 1:nrow() would yield c(1, 0).
  vapply(
    seq_len(nrow(kMat)),
    function(ix) {
      paste(
        yLab[ix],
        paste0("0:", ix),
        paste0(col_ids, ":", kMat[ix, ], collapse = " ")
      )
    },
    character(1)
  )
}
我的版本速度提高了约50%(虽然方式更加丑陋),
# Sprintf method.
#
# Formats the global matrix `kMat` and label vector `yLab` as one text line
# per matrix row:
#   "<label> 0:<row> 1:<value> 2:<value> ..."
# Each row is rendered by sprintf() from a prebuilt format string. Values use
# "%f", i.e. six digits after the decimal point.
#
# Returns a character vector with one element per row of `kMat`
# (character(0) when `kMat` has no rows).
met2 <- function() {
  n_row <- nrow(kMat)
  if (n_row == 0L) {
    return(character(0))
  }

  field_fmt <- c("%i", "0:%i", paste0(seq_len(ncol(kMat)), ":%f"))
  # Label, row index and values side by side; deparse.level = 0 keeps the
  # columns unnamed so no named argument is passed on to sprintf().
  fields <- cbind(yLab, seq_len(n_row), kMat, deparse.level = 0)

  # sprintf() accepts at most 100 arguments (format included), so the fields
  # are rendered in chunks of up to 99 columns.
  chunks <- lapply(
    seq(1L, length(field_fmt), by = 99L),
    function(first) {
      idx <- first:min(length(field_fmt), first + 98L)
      list(range = idx, fmt = paste(field_fmt[idx], collapse = " "))
    }
  )

  vapply(
    seq_len(n_row),
    function(ix) {
      parts <- vapply(
        chunks,
        function(chunk) {
          do.call(sprintf, c(list(chunk$fmt), as.list(fields[ix, chunk$range])))
        },
        character(1)
      )
      # Join chunks directly; seeding a Reduce() with "" would prepend a
      # stray space to every line.
      paste(parts, collapse = " ")
    },
    character(1)
  )
}
# Time met1() against met2() with microbenchmark() and print the summary.
print(microbenchmark(met1(), met2()))
Unit: milliseconds
expr min lq mean median uq max neval
met1() 85.83051 88.00289 92.01948 88.61834 90.31918 175.3362 100
met2() 44.81729 45.61020 56.12835 54.75313 56.65249 108.7218 100
是否有更快(或更整洁)的方式来处理这种格式?
答案 0 :(得分:0)
这种写法既简短又快速。用 sprintf 将 double 转换为 character 似乎比隐式的 as.character 更快,而 stringi::stri_join 也比 paste 和 paste0 快一些。我也尝试过去掉转置的写法,但下面的代码更快。
library(stringi)
# stringi method.
#
# Formats the global matrix `kMat` and label vector `yLab` as one text line
# per matrix row, in the same layout met1() and met2() produce:
#   "<label> 0:<row> 1:<value> 2:<value> ..."
# Values are rendered with sprintf("%f"), i.e. six digits after the decimal
# point.
#
# Returns a character vector with one element per row of `kMat`
# (character(0) when `kMat` has no rows).
met3 <- function() {
  n_row <- nrow(kMat)
  n_col <- ncol(kMat)
  if (n_row == 0L) {
    return(character(0))
  }

  # t(kMat) is read column by column, i.e. one original row at a time, so the
  # column ids 1..n_col recycle in step with the values of each row.
  cells <- stri_join(seq_len(n_col), ":", sprintf("%f", t(kMat)))
  # One column of `cell_mat` per original row of kMat.
  cell_mat <- matrix(cells, nrow = n_col)
  features <- apply(cell_mat, 2, stri_join, collapse = " ")

  stri_join(yLab, stri_join("0:", seq_len(n_row)), features, sep = " ")
}
基准测试结果如下:
> microbenchmark(met3(), met2(), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
met3() 236.6127 255.1396 264.7797 256.6331 292.1037 296.6377 10
met2() 307.6371 322.1467 354.7281 332.0041 388.2474 464.2259 10