加快矢量创作

时间:2015-11-09 08:54:04

标签: r data.table

我需要创建一个向量,其中所有数字都在表中定义的范围内。例如,行23:25和34:39将成为单个向量c(23, 24, 25, 34, 35, 36, 37, 38, 39)

下面的MWE会这样做,但速度太慢了。我需要做15,000,000或更高的n.rows。

row.references是输入。 row.references.long是想要的输出。

有什么更好的代码可以做到这一点?

library(data.table)
# Create example data
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
row.references.long <- NULL
# The too-slow code
for (i in 1:nrow(row.references)) {
  row.references.long <- rbind(row.references.long, data.table(row.references[i, start.number]:row.references[i, end.number]))
}

我认为data.table是要走的路。

4 个答案:

答案 0 :(得分:9)

出于某种原因,这仍然让我感到沮丧。我不确定使用by = 1:nrow(indt)是否有很大的劣势,但这给了我很好的表现。

我建议&#34; data.table&#34;只会是:

row.references[, list(V1 = start.number:end.number), 
               by = 1:nrow(row.references)]$V1

对于基地R,将是:

unlist(mapply(":", row.references$start.number, row.references$end.number), 
         use.names = FALSE)

后者类似于Roland的方法,但只使用:unlist代替do.call(c, ...)

基准

以下是您的示例数据:

library(data.table)
set.seed(1)
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), 
                             steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]

以下是一些可以试用的功能:

AM1 <- function() {
  unlist(mapply(":", row.references$start.number, row.references$end.number), 
         use.names = FALSE)
}

AM2 <- function() {
  row.references[, list(V1 = start.number:end.number), 
                 by = 1:nrow(row.references)]$V1
}

roland1 <- function() {
  do.call(c, mapply(seq, 
                    row.references[["start.number"]], 
                    row.references[["end.number"]], 
                    MoreArgs = list(by = 1)))
}

roland2 <- function(indt = copy(row.references)) {
  indt[, lengths := end.number - start.number + 1]
  res <- indt[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
  res[, V1 := V1 + seq_along(V1), 
      by = rep(seq_len(nrow(indt)), indt[["lengths"]])]$V1
}

jaap <- function(indt = copy(row.references)) {
  indt[, `:=` (idx=.I)][, .(var = seq(start.number,end.number)), by = idx]$var
}

检查它们是否相等:

sapply(c(quote(AM2()), quote(roland1()), quote(roland2()), quote(jaap())), 
       function(x) all.equal(AM1(), eval(x)))
# [1] TRUE TRUE TRUE TRUE

现在,制作一些更大的数据:

# Make the data bigger -- 2.5 million rows
row.references <- rbindlist(replicate(2500, row.references, FALSE))
dim(row.references)

测试时间:

system.time(AM1())
#    user  system elapsed 
#   6.936   0.000   6.845 

system.time(AM2())
#    user  system elapsed 
#   2.480   0.212   2.800 

system.time(roland1())
#    user  system elapsed 
#  64.932   0.000  63.525 

system.time(roland2())
#    user  system elapsed 
#   3.488   0.000   2.434 

system.time(jaap())
#    user  system elapsed 
#  14.068   0.000  13.643 

似乎roland2AM2是可行的选择。即使这个&#34; microbenchmark&#34;有点偏,我觉得AM2在可读性方面胜过:

library(microbenchmark)
microbenchmark(AM2(), roland2(), times = 20)
# Unit: seconds
#        expr      min       lq     mean   median       uq      max neval
#       AM2() 2.202286 2.236027 2.323602 2.320230 2.394856 2.477074    20
#   roland2() 2.314997 2.428790 2.502338 2.477764 2.589151 2.700195    20

答案 1 :(得分:4)

不要在循环中生长对象。预分配。这是一个更高效的循环版本:

res <- do.call(c, mapply(seq, 
                        row.references[["start.number"]], 
                        row.references[["end.number"]], 
                        MoreArgs = list(by = 1)))
all.equal(res, row.references.long[[1]])
#[1] TRUE

这是另一种选择。基准测试是否更快。

row.references[, lengths := end.number - start.number + 1]
res <- row.references[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
res[, V1 := V1 + seq_along(V1), 
    by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])]
all.equal(res, row.references.long)
#[1] TRUE

但是,我会在编译代码中执行此操作,即使用Rcpp。

答案 2 :(得分:4)

正如@Roland所提到的,不需要for循环。您可以在索引(idx)列的帮助下完全在 data.table 中完成此操作:

set.seed(12)
row.ref <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
                      steps=floor(runif(n=n.rows, min=1, max=50)))
row.ref[, `:=` (end.number=start.number+steps, idx=.I)]

row.ref.l <- row.ref[, .(var = seq(start.number,end.number)), by = idx][, idx:=NULL]

导致:

> head(row.ref,3)
   start.number steps end.number idx
1:            7     3         10   1
2:           81     8         89   2
3:           94    40        134   3

> head(row.ref.l,15)
    var
 1:   7
 2:   8
 3:   9
 4:  10
 5:  81
 6:  82
 7:  83
 8:  84
 9:  85
10:  86
11:  87
12:  88
13:  89
14:  94
15:  95

几个提出的解决方案的基准:

microbenchmark(jaap = row.references[, .(var = seq(start.number,end.number)), by = idx],
               roland1 = do.call(c, mapply(seq,row.references[["start.number"]],row.references[["end.number"]],MoreArgs = list(by = 1))),
               roland2 = row.references[, lengths := end.number - start.number + 1][, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))][, V1 := V1 + seq_along(V1), by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])],
               ananda = data.table(unlist(mapply(":", row.references$start.number, row.references$end.number), use.names = FALSE)),
               times = 100, unit = "relative")

给出:

Unit: relative
    expr       min        lq      mean    median        uq      max neval  cld
    jaap  5.517431  5.358356  5.168787  5.183828  5.164292 2.157907   100   c 
 roland1 19.023350 18.029892 16.897831 17.475423 17.111857 5.682705   100    d
 roland2  2.051662  2.015041  1.912261  1.964078  1.957416 1.277075   100  b  
  ananda  1.000000  1.000000  1.000000  1.000000  1.000000 1.000000   100 a   

答案 3 :(得分:1)

对于避免显式迭代的一种方法,创建一个跨越整个范围的序列(使用数字向量来避免整数溢出),然后通过所需的偏移来校正元素,使元素对应于每个序列的开头,等于序列的开始。

f <- function(start, step) {
    res <- seq(1, sum(step + 1), by=1)
    offset <- start -  c(0, cumsum(step + 1)[-length(step)]) - 1L
    res + rep(offset, step + 1)
}