I want to filter rows and store the original row numbers (from the source data) in a data.table column.
I know the .I syntax variant for this (see https://stackoverflow.com/a/23586059/4468078), but it requires me to filter twice, which I want to avoid:
DT <- mtcars
setDT(DT)
row.numbers <- DT[, .I[ gear > 4]]
# > row.numbers
# [1] 27 28 29 30 31
DT[row.numbers, .(row.numbers, gear)]
#    row.numbers gear
# 1:          27    5
# 2:          28    5
# 3:          29    5
# 4:          30    5
# 5:          31    5
If I use the plain .I syntax instead, the row numbers of the subset are returned, not those of the original data:
DT[gear > 4, .(.I, gear)]
   I gear
1: 1    5
2: 2    5
3: 3    5
4: 4    5
5: 5    5
Any ideas for a simpler or more optimized solution?
Answer 0 (score: 2)
EDIT 2: Added @Frank's w2 variant...

To complement @UweBlock's accepted answer, I have done some benchmarking which I want to show here to share the results:
library(data.table)
library(microbenchmark)
# size: about 800 MB
DT <- data.table(x = sample(1000, 1E8, replace = TRUE), y = sample(1000, 1E8, replace = TRUE))
LIMIT <- 500
microbenchmark(row.filter = {
                 row.numbers <- DT[, .I[x > LIMIT]]
                 res <- DT[row.numbers, .(row.numbers, x, y)]
               },
               chaining = {
                 res <- DT[, row.number := .I][x > LIMIT, .(row.number, x, y)]
               },
               w2 = {
                 w = DT[x > LIMIT, which = TRUE]
                 DT[w, c("x", "y")][, w := w]
               },
               times = 20)
The results are:
Unit: seconds
       expr      min       lq     mean   median       uq      max neval cld
 row.filter 2.039627 2.152165 2.290105 2.284775 2.381632 2.652203    20   b
   chaining 2.032791 2.272493 2.369759 2.359630 2.472028 2.777191    20   b
         w2 1.104414 1.194826 1.274428 1.257893 1.311050 1.557225    20  a
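For context, the which = TRUE used in the w2 variant returns the row indices of the matching rows instead of the subset itself; a minimal sketch of the idea:

w <- DT[x > LIMIT, which = TRUE]  # integer indices of the matching rows
# equivalent to DT[, .I[x > LIMIT]], but computed in a single filtering step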
EDIT 1: To compare the impact of the filter's selectivity, for LIMIT <- 100:
Unit: seconds
       expr      min       lq     mean   median       uq      max neval cld
 row.filter 3.254134 3.638193 4.053991 3.865599 4.432278 5.337939    20   b
   chaining 3.005504 3.874443 4.116179 4.069974 4.391666 4.994020    20   b
         w2 1.289617 1.588608 1.965523 1.962185 2.294457 2.859689    20  a
For LIMIT <- 900:
Unit: milliseconds
       expr      min      lq     mean   median       uq       max neval cld
 row.filter 900.9504 905.0694 914.9406 907.5211 916.2071  964.6856    20  b
   chaining 927.1630 932.0981 965.8222 970.9336 981.5885 1030.6396    20   c
         w2 607.0091 609.8028 620.5582 612.0490 615.2337  669.9706    20 a
Answer 1 (score: 2)
Switching from .(rn, gear) to c("rn", "gear") might speed things up, but this has not been benchmarked separately. Also, LIMIT <- 500 is of type double; data.table occasionally warns about type conversions, so I wondered what impact the type conversion might have on performance in this case.

So far, three answers have been given, which make up five code variants:

- row.filter (unfortunately, I did not get this one to work in an SE version)
- chaining_nse and chaining_se
- which_nse and which_se

The benchmark covers:

- different values of LIMIT: 100, 500, 900
- the type of LIMIT: integer and double, to test the effect of type conversion

The number of repetitions is computed from the problem size, with a minimum of 3 and a maximum of 100 runs.
The cost of the type conversion is about 4% (median) to 9% (mean). So it does make a difference whether you write LIMIT <- 500 or LIMIT <- 500L, with the L suffix denoting an integer constant.
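As a minimal illustration of the two constant types (plain base R, nothing beyond typeof is assumed here):

typeof(500)   # "double"  - numeric literals default to double
typeof(500L)  # "integer" - the L suffix creates an integer constant
# Comparing an integer column against a double constant forces a coercion,
# which the figures above put at roughly 4-9 % extra runtime.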
The performance loss due to non-standard evaluation is much higher: on average, the NSE versions of both methods need about 50% more time than their SE counterparts.
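To make the distinction concrete, here is a sketch of the two styles of j being compared (assuming row.number has already been added as in the chaining variants below):

DT[x > LIMIT, .(row.number, x, y)]        # NSE: symbols evaluated inside DT
DT[x > LIMIT, c("row.number", "x", "y")]  # SE: a plain character vector of names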
(Note that the chart below only shows results for type integer.)

The chart below for limit 500 and type integer shows that the SE variants are faster than their NSE counterparts for all problem sizes. Interestingly, chaining_se seems to have a slight advantage over which_se for smaller problem sizes of up to 5000 rows, while for problem sizes above 5 M rows which_se is clearly faster.
As requested, here is a table showing the timings of the chart above in milliseconds:
dcast(bm_med[limit == 500L & type == "int"][
  , expr := forcats::fct_reorder(factor(expr), -time)],
  expr ~ n_rows, fun.aggregate = function(x) max(x/1E6), value.var = "time")
           expr       100      1000     10000    1e+05    1e+06    1e+07    1e+08
1: chaining_nse 0.8189745 0.8493695 1.0115405 2.870750 22.34469 441.1621 2671.179
2:   row.filter 0.7693225 0.7972635 0.9622665 2.677807 21.30861 247.3984 2677.495
3:    which_nse 0.8486145 0.8690035 1.0117295 2.620980 18.39406 219.0794 2341.990
4:  chaining_se 0.5299360 0.5582545 0.6454755 1.700626 12.48982 166.0164 2049.904
5:     which_se 0.5894045 0.6114935 0.7040005 1.624166 13.00125 130.0718 1289.050
library(data.table)
library(microbenchmark)
run_bm <- function(n_rows, limit = 500L, type = "int") {
  set.seed(1234L)  # reproducible sampling
  DT <- data.table(x = sample(1000, n_rows, replace = TRUE),
                   y = sample(1000, n_rows, replace = TRUE))
  LIMIT <- switch(type,
                  int = as.integer(limit),
                  dbl = as.double(limit))
  # repetitions depend on problem size: at least 3, at most 100 runs
  times <- round(scales::squish(sqrt(1E8 / n_rows), c(3L, 100L)))
  cat("Start run:", n_rows, limit, type, times, "\n")
  microbenchmark(row.filter = {
                   row.numbers <- DT[, .I[x > LIMIT]]
                   DT[row.numbers, .(row.numbers, x, y)]
                 },
                 chaining_nse = {
                   DT[, row.number := .I][x > LIMIT, .(row.number, x, y)]
                 },
                 chaining_se = {
                   DT[, row.number := .I][x > LIMIT, c("row.number", "x", "y")]
                 },
                 which_nse = {
                   row.numbers <- DT[x > LIMIT, which = TRUE]
                   DT[row.numbers, .(x, y)][, row.numbers := row.numbers][]
                 },
                 which_se = {
                   row.numbers <- DT[x > LIMIT, which = TRUE]
                   DT[row.numbers, c("x", "y")][, row.numbers := row.numbers][]
                 },
                 times = times)
}
# parameter grid: all combinations of problem size, limit and type
bm_par <- CJ(n_rows = 10^seq(2L, 8L, 1L),
limit = seq(100L, 900L, 400L),
type = c("int", "dbl"))
# run the benchmarks
bm_raw <- bm_par[, run_bm(n_rows, limit, type), by = .(n_rows, limit, type)]
# aggregate results
bm_med <- bm_raw[, .(time = median(time)), by = .(n_rows, limit, type, expr)]
library(ggplot2)
# chart 1
ggplot(
dcast(bm_med, n_rows + limit + expr ~ type, value.var = "time")[
, ratio := dbl / int - 1.0] #[limit == 500L]
) +
aes(n_rows, ratio, colour = expr) +
geom_point() +
geom_line() +
facet_grid(limit ~ expr) +
scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
scale_y_continuous(labels = scales::percent) +
coord_cartesian(ylim = c(-0.1, 0.5)) +
geom_hline(yintercept = 0) +
theme_bw() +
ggtitle("Performance loss due to type conversion") +
ylab("Relative computing time dbl vs int") +
xlab("Number of rows (log scale)")
ggsave("p2.png")
# chart 2
ggplot(
dcast(bm_med[, c("code", "eval") := tstrsplit(expr, "_")][!is.na(eval)],
n_rows + limit + type + code ~ eval, value.var = "time")[
, ratio := nse / se - 1.0][type == "int"]
) +
aes(n_rows, ratio, colour = code) +
geom_point() +
geom_line() +
facet_grid(limit + type ~ code) +
scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = 0) +
theme_bw() +
ggtitle("Performance loss due to non standard evaluation") +
ylab("Relative computing time NSE vs SE") +
xlab("Number of rows (log scale)")
ggsave("p3.png")
# chart 3
ggplot(bm_med[limit == 500L][type == "int"]) +
aes(n_rows, time/1E6, colour = expr) +
geom_point() +
geom_smooth(se = FALSE) +
  facet_grid(type ~ limit) +
scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
scale_y_log10(labels = function(x) scales::math_format()(log10(x))) +
theme_bw() +
ggtitle("Benchmark results (log-log scale)") +
ylab("Computing time in ms (log scale)") +
xlab("Number of rows (log scale)")
ggsave("p1.png")
Answer 2 (score: 1)
You can add a column of row numbers before filtering:
library(data.table)
data.table(mtcars)[, rn := .I][gear > 4, .(rn, gear)]
   rn gear
1: 27    5
2: 28    5
3: 29    5
4: 30    5
5: 31    5
Disclaimer: just a quick benchmark. The mtcars dataset with 32 rows is far too small to be conclusive, but the focus here is on the overhead.
microbenchmark::microbenchmark(
  copy = DT <- data.table(mtcars),
  ryoda = {
    DT <- data.table(mtcars)
    row.numbers <- DT[, .I[gear > 4]]
    DT[row.numbers, .(row.numbers, gear)]
  },
  uwe = {
    DT <- data.table(mtcars)
    DT[, rn := .I][gear > 4, .(rn, gear)]
  },
  times = 1000L
)
Unit: microseconds
  expr      min       lq     mean   median       uq       max neval cld
  copy  691.710  727.192  803.235  749.385  785.428 15989.293  1000 a
 ryoda 1821.869 1883.479 2001.653 1930.213 2011.124  6650.497  1000  b
   uwe 1860.288 1934.191 2053.004 1987.927 2077.370  5908.892  1000   c
Note that each benchmark run is started with a fresh copy of DT, because one of the code variants modifies DT (using :=).
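A minimal sketch of that pattern, assuming the goal is to keep the original DT untouched:

DT <- data.table(mtcars)
DT2 <- copy(DT)       # explicit deep copy; := on DT2 no longer affects DT
DT2[, rn := .I]       # adds rn to DT2 only
"rn" %in% names(DT)   # FALSE - the original stays unchanged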
Here, with the small 32-row sample dataset, there seems to be a penalty of 50 to 60 microseconds per run. R Yoda's benchmark results with a large dataset (about 800 MB) show a difference of about 1% in either direction, depending on the number of filtered rows, i.e., the length of row.numbers.
Answer 3 (score: 1)
A little faster than the example in @RYoda's answer:
w = DT[x > LIMIT, which = TRUE]
DT[w, c("x", "y")][, w := w]
To change the order of the columns in the result, setcolorder should work, taking almost no time.
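For illustration, a sketch of what that could look like, assuming the result from the snippet above with the row numbers moved to the front:

res <- DT[w, c("x", "y")][, w := w]
setcolorder(res, c("w", "x", "y"))  # reorders columns by reference, no copy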