Identify the original row numbers of filtered rows

Asked: 2017-07-26 16:03:47

Tags: r data.table

I want to filter rows and store the original row numbers (of the source data) in a data.table column.

I know there is a syntax variant of .I (see https://stackoverflow.com/a/23586059/4468078), but it requires me to filter twice, which I want to avoid:

DT <- mtcars
setDT(DT)
row.numbers <- DT[, .I[ gear > 4]]
# > row.numbers
# [1] 27 28 29 30 31
DT[row.numbers, .(row.numbers, gear)]
#    row.numbers gear
# 1:          27    5
# 2:          28    5
# 3:          29    5
# 4:          30    5
# 5:          31    5

If I use the plain .I syntax, the row numbers of the subset are returned, not those of the original data:

DT[gear > 4, .(.I, gear)]

   I gear
1: 1    5
2: 2    5
3: 3    5
4: 4    5
5: 5    5

Any ideas for a simpler/optimized solution?

4 answers:

Answer 0 (score: 2):

Edit 2: Added @Frank's w2 variant...

To complement @UweBlock's accepted answer, I have done some benchmarking and want to share the results here:

library(data.table)
library(microbenchmark)
# size: about 800 MB
DT <- data.table(x = sample(1000, 1E8, replace = TRUE), y = sample(1000, 1E8, replace = TRUE))
LIMIT <- 500

microbenchmark(row.filter = {
  row.numbers <- DT[, .I[x > LIMIT]]
  res <- DT[row.numbers, .(row.numbers, x, y)]
},
chaining = {
  res <- DT[, row.number := .I][x > LIMIT, .(row.number, x, y)]
},
w2 = {
  w = DT[x > LIMIT, which = TRUE ]
  DT[w, c("x","y")][, w := w ]
},
times = 20)

The results are:

Unit: seconds
       expr      min       lq     mean   median       uq      max neval cld
 row.filter 2.039627 2.152165 2.290105 2.284775 2.381632 2.652203    20   b
   chaining 2.032791 2.272493 2.369759 2.359630 2.472028 2.777191    20   b
         w2 1.104414 1.194826 1.274428 1.257893 1.311050 1.557225    20  a 

Edit 1: To compare the impact of the filter selectivity:

LIMIT <- 100

Unit: seconds
       expr      min       lq     mean   median       uq      max neval cld
 row.filter 3.254134 3.638193 4.053991 3.865599 4.432278 5.337939    20   b
   chaining 3.005504 3.874443 4.116179 4.069974 4.391666 4.994020    20   b
         w2 1.289617 1.588608 1.965523 1.962185 2.294457 2.859689    20  a 

And for LIMIT <- 900:

Unit: milliseconds
       expr      min       lq     mean   median       uq       max neval cld
 row.filter 900.9504 905.0694 914.9406 907.5211 916.2071  964.6856    20  b 
   chaining 927.1630 932.0981 965.8222 970.9336 981.5885 1030.6396    20   c
         w2 607.0091 609.8028 620.5582 612.0490 615.2337  669.9706    20 a  

Answer 1 (score: 2):

Why yet another benchmark?

  1. Frank mentioned in a comment that switching from .(rn, gear) to c("rn", "gear") might speed things up, but this has not been benchmarked separately yet.
  2. In R Yoda's benchmark, the sample data is of type integer while LIMIT <- 500 is of type double. data.table occasionally warns about type conversions, so I wanted to know what performance impact the type conversion might have in this case.

    What is benchmarked?

    So far, 3 answers have been given which make up 5 code variants: row.filter, chaining_nse, chaining_se, which_nse, and which_se (see the benchmark code below).

    Unfortunately, I was unable to get row.filter working in an SE version.

    Which parameters are used?

    • Problem size (number of rows): 10^2, 10^3, ..., 10^8
    • Different values of LIMIT: 100, 500, 900
    • Type of LIMIT: integer and double, to test the effect of type conversion

    The number of repetitions is computed from the problem size, with a minimum of 3 and a maximum of 100 runs per benchmark.
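
    For illustration, a quick check of the repetition formula used in run_bm() below (a minimal sketch, not part of the original answer):

    # times = round(scales::squish(sqrt(1E8 / n_rows), c(3L, 100L)))
    round(scales::squish(sqrt(1E8 / 1E4), c(3L, 100L)))  # 100 -> capped at the maximum of 100 runs
    round(scales::squish(sqrt(1E8 / 1E8), c(3L, 100L)))  # 1   -> raised to the minimum of 3 runs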

    Results

    Type conversion costs about 4% (median) to 9% (mean) of performance. So it does matter whether you write LIMIT <- 500 or LIMIT <- 500L, using the L suffix to denote an integer constant.

    (Chart: Performance loss due to type conversion)
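
    As a quick illustration of the distinction (a minimal sketch, not part of the benchmark): in R, a plain numeric literal is of type double, while the L suffix creates an integer constant.

    typeof(500)   # "double"  - comparing it with an integer column forces a type conversion
    typeof(500L)  # "integer" - matches the type of the sampled x column, no conversion needed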

    The performance penalty for using non-standard evaluation is much higher: on average, NSE needs about 50% more time than SE for both approaches.
    (Note that the chart below only shows results for type integer.)

    (Chart: Performance loss due to non-standard evaluation)

    The chart below for LIMIT 500 and type integer shows that the SE variants are faster than their NSE counterparts for all problem sizes. Interestingly, chaining_se seems to have a slight advantage over which_se for smaller problem sizes of up to 5000 rows, while for problem sizes above 5 M rows which_se is clearly faster.

    (Chart: Benchmark results on log-log scale)

    As requested, here is a table with the timings (in ms) from the chart above:

    dcast(bm_med[limit == 500L & type == "int"][
      , expr := forcats::fct_reorder(factor(expr), -time)],
      expr ~ n_rows, fun.aggregate = function(x) max(x/1E6), value.var = "time")
    
               expr       100      1000     10000    1e+05    1e+06    1e+07    1e+08
    1: chaining_nse 0.8189745 0.8493695 1.0115405 2.870750 22.34469 441.1621 2671.179
    2:   row.filter 0.7693225 0.7972635 0.9622665 2.677807 21.30861 247.3984 2677.495
    3:    which_nse 0.8486145 0.8690035 1.0117295 2.620980 18.39406 219.0794 2341.990
    4:  chaining_se 0.5299360 0.5582545 0.6454755 1.700626 12.48982 166.0164 2049.904
    5:     which_se 0.5894045 0.6114935 0.7040005 1.624166 13.00125 130.0718 1289.050
    

    Benchmark code

    library(data.table)
    library(microbenchmark)
    run_bm <- function(n_rows, limit = 500L, type = "int") {
      set.seed(1234L)
      DT <- data.table(x = sample(1000, n_rows, replace = TRUE), 
                       y = sample(1000, n_rows, replace = TRUE))
      LIMIT <- switch(type,
                      int = as.integer(limit),
                      dbl = as.double(limit))
      times <- round(scales::squish(sqrt(1E8 / n_rows) , c(3L, 100L)))
      cat("Start run:", n_rows, limit, type, times, "\n")
      microbenchmark(row.filter = {
        row.numbers <- DT[, .I[x > LIMIT]]
        DT[row.numbers, .(row.numbers, x, y)]
      },
      chaining_nse = {
        DT[, row.number := .I][x > LIMIT, .(row.number, x, y)]
      },
      chaining_se = {
        DT[, row.number := .I][x > LIMIT, c("row.number", "x", "y")]
      },
      which_nse = {
        row.numbers <- DT[x > LIMIT, which = TRUE ]
        DT[row.numbers, .(x, y)][, row.numbers := row.numbers ][]
      },
      which_se = {
        row.numbers <- DT[x > LIMIT, which = TRUE ]
        DT[row.numbers, c("x", "y")][, row.numbers := row.numbers][]
      },
      times = times)
    }
    # parameter
    bm_par <- CJ(n_rows = 10^seq(2L, 8L, 1L), 
                 limit = seq(100L, 900L, 400L), 
                 type = c("int", "dbl"))
    # run the benchmarks
    bm_raw <- bm_par[, run_bm(n_rows, limit, type), by = .(n_rows, limit, type)]
    # aggregate results
    bm_med <- bm_raw[, .(time = median(time)), by = .(n_rows, limit, type, expr)]
    

    Chart code

    library(ggplot2)
    
    # chart 1
    ggplot(
      dcast(bm_med, n_rows + limit + expr ~ type, value.var = "time")[
        , ratio := dbl / int - 1.0] #[limit == 500L]
    ) + 
      aes(n_rows, ratio, colour = expr) +
      geom_point() + 
      geom_line() + 
      facet_grid(limit ~ expr) + 
      scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
      scale_y_continuous(labels = scales::percent) + 
      coord_cartesian(ylim = c(-0.1, 0.5)) +
      geom_hline(yintercept = 0) +
      theme_bw() +
      ggtitle("Performance loss due to type conversion") +
      ylab("Relative computing time dbl vs int") + 
      xlab("Number of rows (log scale)")
    ggsave("p2.png")
    
    # chart 2
    ggplot(
      dcast(bm_med[, c("code", "eval") := tstrsplit(expr, "_")][!is.na(eval)], 
            n_rows + limit + type + code ~ eval, value.var = "time")[
              , ratio := nse / se - 1.0][type == "int"]
    ) + 
      aes(n_rows, ratio, colour = code) +
      geom_point() + 
      geom_line() + 
      facet_grid(limit  + type ~ code) + 
      scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
      scale_y_continuous(labels = scales::percent) + 
      geom_hline(yintercept = 0) +
      theme_bw() +
      ggtitle("Performance loss due to non standard evaluation") +
      ylab("Relative computing time NSE vs SE") + 
      xlab("Number of rows (log scale)")
    ggsave("p3.png")
    
    # chart 3
    ggplot(bm_med[limit == 500L][type == "int"]) + 
      aes(n_rows, time/1E6, colour = expr) +
      geom_point() + 
      geom_smooth(se = FALSE) + 
      facet_grid(type ~ limit) +  # a second facet_grid() call overrides the first, so only one is kept
      scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
      scale_y_log10(labels = function(x) scales::math_format()(log10(x))) +
      theme_bw() +
      ggtitle("Benchmark results (log-log scale)") +
      ylab("Computing time in ms (log scale)") + 
      xlab("Number of rows (log scale)")
    ggsave("p1.png")
    

Answer 2 (score: 1):

You can add a column of row numbers before filtering:

library(data.table)
data.table(mtcars)[, rn := .I][gear > 4, .(rn, gear)]
   rn gear
1: 27    5
2: 28    5
3: 29    5
4: 30    5
5: 31    5
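
Note that := adds the rn column to DT itself as a side effect. If the helper column is unwanted in the source data, a minimal sketch of dropping it again (standard data.table usage, not part of the original answer):

DT <- data.table(mtcars)
res <- DT[, rn := .I][gear > 4, .(rn, gear)]
DT[, rn := NULL]  # remove the helper column from the source data.table again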

Benchmark

Just a quick benchmark: the mtcars data set (32 rows) is far too small to be representative, but the point here is the overhead.

microbenchmark::microbenchmark(
  copy = DT <- data.table(mtcars),
  ryoda = {
    DT <- data.table(mtcars)
    row.numbers <- DT[, .I[ gear > 4]]
    DT[row.numbers, .(row.numbers, gear)]
  },
  uwe = {
    DT <- data.table(mtcars)
    DT[, rn := .I][gear > 4, .(rn, gear)]
  },
  times = 1000L
)
Unit: microseconds
  expr      min       lq     mean   median       uq       max neval cld
  copy  691.710  727.192  803.235  749.385  785.428 15989.293  1000 a  
 ryoda 1821.869 1883.479 2001.653 1930.213 2011.124  6650.497  1000  b 
   uwe 1860.288 1934.191 2053.004 1987.927 2077.370  5908.892  1000   c

Note that each benchmark run is started with a fresh copy of DT because one of the code variants modifies DT (using :=).

Here, with the small 32-row sample data set, there seems to be a penalty of 50 to 60 microseconds. R Yoda's benchmark results with a large data set (about 800 MB) show a difference of about 1% in either direction, depending on the number of filtered rows, i.e., the length of row.numbers.
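
As an aside, the fresh copies are needed because of data.table's reference semantics; a minimal sketch (illustration only, not part of the original answer):

DT1 <- data.table(mtcars)
DT2 <- DT1            # no copy: both names refer to the same data.table
DT2[, rn := .I]       # := modifies in place ...
"rn" %in% names(DT1)  # ... TRUE: DT1 has gained the rn column, too
DT3 <- copy(DT1)      # copy() creates an independent data.table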

Answer 3 (score: 1):

Using the example from @RYoda's answer, this variant is a little faster:

w = DT[x > LIMIT, which = TRUE ]
DT[w, c("x","y")][, w := w ]

To change the order of the columns in the result, setcolorder should work, taking almost no time.
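
For example (a minimal sketch of the setcolorder() idea, assuming the column names from the benchmark above):

res <- DT[w, c("x", "y")][, w := w]
setcolorder(res, c("w", "x", "y"))  # move the row numbers to the front, by reference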