我需要创建一个向量,其中所有数字都在表中定义的范围内。例如,行23:25和34:39将成为单个向量c(23, 24, 25, 34, 35, 36, 37, 38, 39)
下面的MWE会这样做,但速度太慢了。我需要做15,000,000或更高的n.rows。
row.references
是输入。 row.references.long
是想要的输出。
有什么更好的代码可以做到这一点?
library(data.table)
# Create example data
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
row.references.long <- NULL
# The too-slow code
for (i in 1:nrow(row.references)) {
row.references.long <- rbind(row.references.long, data.table(row.references[i, start.number]:row.references[i, end.number]))
}
我认为data.table
是要走的路。
答案 0 :(得分:9)
出于某种原因,这仍然让我感到沮丧。我不确定使用by = 1:nrow(indt)
是否有很大的劣势,但这给了我很好的表现。
我建议&#34; data.table&#34;只会是:
row.references[, list(V1 = start.number:end.number),
by = 1:nrow(row.references)]$V1
对于基地R,将是:
unlist(mapply(":", row.references$start.number, row.references$end.number),
use.names = FALSE)
后者类似于Roland的方法,但只使用:
和unlist
代替do.call(c, ...)
以下是您的示例数据:
library(data.table)
set.seed(1)
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
以下是一些可以试用的功能:
AM1 <- function() {
unlist(mapply(":", row.references$start.number, row.references$end.number),
use.names = FALSE)
}
AM2 <- function() {
row.references[, list(V1 = start.number:end.number),
by = 1:nrow(row.references)]$V1
}
roland1 <- function() {
do.call(c, mapply(seq,
row.references[["start.number"]],
row.references[["end.number"]],
MoreArgs = list(by = 1)))
}
roland2 <- function(indt = copy(row.references)) {
indt[, lengths := end.number - start.number + 1]
res <- indt[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
res[, V1 := V1 + seq_along(V1),
by = rep(seq_len(nrow(indt)), indt[["lengths"]])]$V1
}
jaap <- function(indt = copy(row.references)) {
indt[, `:=` (idx=.I)][, .(var = seq(start.number,end.number)), by = idx]$var
}
检查它们是否相等:
sapply(c(quote(AM2()), quote(roland1()), quote(roland2()), quote(jaap())),
function(x) all.equal(AM1(), eval(x)))
# [1] TRUE TRUE TRUE TRUE
现在,制作一些更大的数据:
# Make the data bigger -- 2.5 million rows
row.references <- rbindlist(replicate(2500, row.references, FALSE))
dim(row.references)
测试时间:
system.time(AM1())
# user system elapsed
# 6.936 0.000 6.845
system.time(AM2())
# user system elapsed
# 2.480 0.212 2.800
system.time(roland1())
# user system elapsed
# 64.932 0.000 63.525
system.time(roland2())
# user system elapsed
# 3.488 0.000 2.434
system.time(jaap())
# user system elapsed
# 14.068 0.000 13.643
似乎roland2
和AM2
是可行的选择。即使这个&#34; microbenchmark&#34;有点偏,我觉得AM2
在可读性方面胜过:
library(microbenchmark)
microbenchmark(AM2(), roland2(), times = 20)
# Unit: seconds
# expr min lq mean median uq max neval
# AM2() 2.202286 2.236027 2.323602 2.320230 2.394856 2.477074 20
# roland2() 2.314997 2.428790 2.502338 2.477764 2.589151 2.700195 20
答案 1 :(得分:4)
不要在循环中生长对象。预分配。这是一个更高效的循环版本:
res <- do.call(c, mapply(seq,
row.references[["start.number"]],
row.references[["end.number"]],
MoreArgs = list(by = 1)))
all.equal(res, row.references.long[[1]])
#[1] TRUE
这是另一种选择。基准测试是否更快。
row.references[, lengths := end.number - start.number + 1]
res <- row.references[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
res[, V1 := V1 + seq_along(V1),
by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])]
all.equal(res, row.references.long)
#[1] TRUE
但是,我会在编译代码中执行此操作,即使用Rcpp。
答案 2 :(得分:4)
正如@Roland所提到的,不需要for循环。您可以在索引(idx
)列的帮助下完全在 data.table 中完成此操作:
set.seed(12)
row.ref <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
steps=floor(runif(n=n.rows, min=1, max=50)))
row.ref[, `:=` (end.number=start.number+steps, idx=.I)]
row.ref.l <- row.ref[, .(var = seq(start.number,end.number)), by = idx][, idx:=NULL]
导致:
> head(row.ref,3)
start.number steps end.number idx
1: 7 3 10 1
2: 81 8 89 2
3: 94 40 134 3
> head(row.ref.l,15)
var
1: 7
2: 8
3: 9
4: 10
5: 81
6: 82
7: 83
8: 84
9: 85
10: 86
11: 87
12: 88
13: 89
14: 94
15: 95
几个提出的解决方案的基准:
microbenchmark(jaap = row.references[, .(var = seq(start.number,end.number)), by = idx],
roland1 = do.call(c, mapply(seq,row.references[["start.number"]],row.references[["end.number"]],MoreArgs = list(by = 1))),
roland2 = row.references[, lengths := end.number - start.number + 1][, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))][, V1 := V1 + seq_along(V1), by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])],
ananda = data.table(unlist(mapply(":", row.references$start.number, row.references$end.number), use.names = FALSE)),
times = 100, unit = "relative")
给出:
Unit: relative
expr min lq mean median uq max neval cld
jaap 5.517431 5.358356 5.168787 5.183828 5.164292 2.157907 100 c
roland1 19.023350 18.029892 16.897831 17.475423 17.111857 5.682705 100 d
roland2 2.051662 2.015041 1.912261 1.964078 1.957416 1.277075 100 b
ananda 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
答案 3 :(得分:1)
对于避免显式迭代的一种方法,创建一个跨越整个范围的序列(使用数字向量来避免整数溢出),然后通过所需的偏移来校正元素,使元素对应于每个序列的开头,等于序列的开始。
f <- function(start, step) {
res <- seq(1, sum(step + 1), by=1)
offset <- start - c(0, cumsum(step + 1)[-length(step)]) - 1L
res + rep(offset, step + 1)
}