更新：以1M行的数据表为基准

Question

我有一个data.table，其中包含3个日期变量：year，start，end。

# Example data: x_desired holds the expected flag for each row.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
test <- data.table(
  year = 2001:2003,
  start = c(2003, 2002, 2000),
  end = c(2003, 2004, 2002),
  x_desired = c(FALSE, TRUE, FALSE)
)

我想要创建一个新变量x，对于每一行，指示year是否落在由start和end定义的范围内。正确的期望结果见变量x_desired。

我想这可以通过以下方式完成：

# Attempt from the question (does NOT give the desired result).
# `start:end` is evaluated once on the whole columns, not once per row, and
# `:` only uses the first element of an operand longer than 1, so every year is
# tested against that single range instead of its own row's range.
test[,x:=(year %in% start:end)]

，但结果显然不正确。我想逐行定义范围，但不知道如何表达。

Answer 1

另一种方法

# Initialise the flag to FALSE on every row, then (chained, still by
# reference) flip it to TRUE only on the rows where start <= year <= end.
DT[, x := FALSE][year %between% list(start, end), x := TRUE]

这种方法应该运行得很快……基准测试结果稍后给出

更新：以1M行的数据表为基准

# Benchmark data: n rows, each column sampled with replacement from the values
# used in the question's example; the seed is fixed so the data is reproducible.
n = 1000000
set.seed(123)
dt <- data.table(year =sample( 2001:2003, n, replace = TRUE),
                 start=sample( c(2003,2002,2000), n, replace = TRUE),
                 end  =sample( c(2003,2004,2002), n, replace = TRUE) )

# Every candidate works on a fresh copy(dt), because := modifies the table by
# reference and would otherwise leak the new column into the next run.
# Note that the copy itself is therefore part of each timing.
microbenchmark::microbenchmark( 
  # Initialise x to FALSE, then set TRUE on the rows matched by %between%.
  wimpel = {
    DT <- copy(dt) 
    DT[, x := FALSE ]
    DT[ year %between% list(start,end), x := TRUE] 
    },
  # between() evaluated once per row by grouping on the row number.
  akrun_nrow = {
    DT <- copy(dt)
    DT[, x := between(year, start, end), 1:nrow(DT)]
    },
  # Map() over the year..end columns, passed positionally to between().
  akrun_map = {
    DT <- copy(dt)
    DT[, x := unlist(do.call(Map, c(f = between, unname(.SD)))), .SDcols = year:end]
    },
  # purrr::pmap_lgl() over the columns, passed to between() by name.
  # NOTE(review): left/right are dplyr::between's argument names, while
  # data.table::between uses lower/upper -- confirm which between() was in
  # scope when these timings were taken.
  akrun_pmap = {
    DT <- copy(dt)
    DT[, x := purrr::pmap_lgl(.SD[, .(x = year, left = start, right = end)], between)]
    },
  # mapply() over the three columns; this one writes to `col` rather than `x`.
  markus = {
    DT <- copy(dt)
    DT[, col := mapply(between, year, start, end)]
  },
  # Only 3 repetitions per candidate.
  times = 3
  )

结果

Unit: milliseconds
       expr        min         lq       mean     median         uq        max neval
     wimpel   29.98388   30.41861   48.98399   30.85333   58.48404   86.11475     3
 akrun_nrow 2741.35268 2755.01860 2944.58975 2768.68453 3046.20829 3323.73206     3
  akrun_map 3673.21253 3683.22849 3711.51209 3693.24446 3730.66188 3768.07929     3
 akrun_pmap 3281.13335 3291.04689 3406.46131 3300.96043 3469.12528 3637.29013     3
     markus 3408.07869 3569.33044 3670.68141 3730.58219 3801.98277 3873.38334     3

似乎有明显的赢家..但也许我在这里错过了什么？

Answer 2

另一种方式

# Compute the flag for all rows with a single vectorised between() call and
# assign it by reference with set(); i = NULL targets every row.
set(DT, i = NULL, j = "x", value = between(DT[["year"]], DT[["start"]], DT[["end"]]))

基准

library(data.table)
# Let data.table use up to 40 threads.
setDTthreads(40L)
# Number of rows. The timings reported after this snippet are labelled 1e6,
# 1e8 and 1e9, so n was presumably changed between runs.
n = 1e9
set.seed(123)
DT = data.table(year =sample( 2001:2003, n, replace = TRUE),
                start=sample( c(2003,2002,2000), n, replace = TRUE),
                end  =sample( c(2003,2004,2002), n, replace = TRUE) )
# Independent copy, so each approach adds x to its own table.
d = copy(DT)

# Approach 1: initialise x to FALSE, then set TRUE on rows matched by %between%.
system.time({DT[, x := FALSE ]; DT[ year %between% list(start,end), x := TRUE]})
# Approach 2: one vectorised between() call assigned with set(). The result is
# written into d but computed from DT's columns (d was copied from DT above).
system.time(set(d, NULL, "x", between(DT$year, DT$start, DT$end)))

# Compare the two resulting tables.
all.equal(d, DT)

时间

1e6

> system.time({DT[, x := FALSE ]; DT[ year %between% list(start,end), x := TRUE]})
   user  system elapsed 
  0.433   0.056   0.053 
> system.time(set(d, NULL, "x", between(DT$year, DT$start, DT$end)))
   user  system elapsed 
  0.152   0.000   0.025 

1e8

> system.time({DT[, x := FALSE ]; DT[ year %between% list(start,end), x := TRUE]})
   user  system elapsed 
  3.811   1.889   3.061 
> system.time(set(d, NULL, "x", between(DT$year, DT$start, DT$end)))
   user  system elapsed 
  2.650   1.112   2.132 

1e9

> system.time({DT[, x := FALSE ]; DT[ year %between% list(start,end), x := TRUE]})
   user  system elapsed 
 32.073  32.600  27.347 
> system.time(set(d, NULL, "x", between(DT$year, DT$start, DT$end)))
   user  system elapsed 
 21.798   8.517  18.248

Answer 3

一种选择是使用between，并按行分组逐行计算

# Row-wise option: group by row number so between() is evaluated once per row.
# seq_len(nrow(test)) replaces 1:nrow(test), which would produce c(1, 0) for a
# zero-row table instead of an empty grouping vector.
test[, x := between(year, start, end), by = seq_len(nrow(test))]
test
#   year start  end x_desired     x
#1: 2001  2003 2003     FALSE FALSE
#2: 2002  2002 2004      TRUE  TRUE
#3: 2003  2000 2002     FALSE FALSE

# Fully vectorised alternative: plain comparisons on whole columns, no grouping.
test[, x := year >= start & year <= end]

或者另一个选择是Map

# Map() walks the three columns in parallel and calls between() once per row
# with positional arguments; unlist() collapses the resulting list of
# length-1 logicals into a plain logical vector.
test[, x := unlist(Map(between, year, start, end))]

或者使用purrr中的pmap

library(purrr)
# Pass the columns positionally. The bound arguments are named lower/upper in
# data.table::between but left/right in dplyr::between, so naming them
# left/right only works when dplyr's version is the one in scope; positional
# matching works with either.
test[, x := pmap_lgl(list(year, start, end), between)]

基准

在新选项上添加了基准（使用与@Wimpel的大数据相同的数据集）

# Compares the update-by-subset approach with plain vectorised comparisons.
# `dt` is not created in this snippet; it must already exist in the session.
# Each candidate works on a fresh copy(dt) because := modifies by reference.
microbenchmark(

 # Initialise x to FALSE, then set TRUE on the rows matched by %between%.
 wimpel = {
    DT <- copy(dt) 
    DT[, x := FALSE ]
    DT[ year %between% list(start,end), x := TRUE] 
    },

    # Single pass of elementwise comparisons over the whole columns.
    akrun = {
    DT <- copy(dt)
    DT[, x := year >= start & year <= end]
    }, times = 3)
# Unit: milliseconds
#   expr      min       lq     mean   median       uq      max neval
# wimpel 23.25196 40.72112 49.29130 58.19027 62.31098 66.43168     3
#  akrun 19.56071 22.04272 22.96553 24.52473 24.66793 24.81114     3

创建标志，指示年变量是否在data.table中的start：end变量范围内

3 个答案:

更新：以1M行的数据表为基准

基准