假设我有一个data.table,如下所示:
# Example data (question setup): three groups, three ISO-8601 timestamps each.
dt <- data.table(
  group = rep(c(1, 2, 3), each = 3),
  time = c("2016-03-09T08:31:00-05:00", "2016-03-08T11:31:00-05:00",
           "2016-03-06T08:31:00-05:00", "2016-04-04T23:28:00-04:00",
           "2016-04-10T23:28:00-04:00", "2016-04-09T23:28:00-04:00",
           "2016-05-11T19:52:00-04:00", "2016-05-10T20:52:00-04:00",
           "2016-04-11T19:52:00-04:00")
)
dt
group time
1: 1 2016-03-09T08:31:00-05:00
2: 1 2016-03-08T11:31:00-05:00
3: 1 2016-03-06T08:31:00-05:00
4: 2 2016-04-04T23:28:00-04:00
5: 2 2016-04-10T23:28:00-04:00
6: 2 2016-04-09T23:28:00-04:00
7: 3 2016-05-11T19:52:00-04:00
8: 3 2016-05-10T20:52:00-04:00
9: 3 2016-04-11T19:52:00-04:00
对于此data.table中的每个组,我只想保留落在该组最近一次观察之前24小时以内的观察结果。我为此写了一个笨拙的解决方案,但在大型数据集上它的速度达不到我的要求。
library(lubridate)
# Parse the ISO-8601 strings to POSIXct in place (set() modifies by reference,
# avoiding a copy of the table).
set(dt,j = "time",value = ymd_hms(dt[["time"]]))
# For each group: broadcast the group's latest time next to every row, then
# keep only the rows strictly within 24 hours of that latest time.
dt[,.(mostRecent = max(time),time),by = group][
time > (mostRecent - days(1)),.(group,time)]
group time
1: 1 2016-03-09 13:31:00
2: 1 2016-03-08 16:31:00
3: 2 2016-04-11 03:28:00
4: 3 2016-05-11 23:52:00
5: 3 2016-05-11 00:52:00
有没有人有关于如何更优雅/更快地完成任务的提示?
答案 0 :(得分:4)
首先,将阈值放在表格中:
thresh_dt = dt[, .(time = max(time)), by=group][, time := time - 24*60*60][]
把 max 的计算与减去一天的秒数这一步分开,是为了利用 data.table 的 "GForce" 优化版 max。另请参阅 ?datatable.optimize。
接下来,进行滚动或非等距连接:
# Rolling join: each row of dt is matched to the latest threshold at or
# before its time; nomatch=0 drops rows older than the group's cutoff.
thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0]
# or, on data.table 1.9.7+, the same filter expressed as a non-equi join:
thresh_dt[dt, on=.(group, time <= time), nomatch=0]
group time
1: 1 2016-03-09 13:31:00
2: 1 2016-03-08 16:31:00
3: 2 2016-04-11 03:28:00
4: 2 2016-04-10 03:28:00
5: 3 2016-05-11 23:52:00
6: 3 2016-05-11 00:52:00
基准测试。GForce max 和滚动连接的优势只有在组的数量足够多时才会显现。我扩展了 @sbstn 的示例数据,把组的数量作为一个参数:
# Benchmark setup: N rows sampled from a one-minute grid spanning six
# months, spread across ng groups.
N = 5e6
ng = 1e5
all_times = seq(from = as.POSIXct('2016-01-01 10:00:00'),
to = as.POSIXct('2016-06-30 10:00:00'),
by = 60)
# Integer copy of the grid, to compare integer vs POSIXct performance.
all_times_int = as.integer(all_times)
idx = sample(seq.int(length(all_times)), N, replace = TRUE)
dt = data.table(group = sample(ng, N, replace = TRUE),
time = all_times[idx],
time_int = all_times_int[idx])
# sbstn, no gmax: max() is combined with the subtraction inside j, which
# prevents data.table's GForce optimization of max() from kicking in.
system.time({
dt[, cutoff_time := max(time) - 24*60*60, by = group]
dt[time >= cutoff_time]
})
# user system elapsed
# 8.50 0.01 8.47
# sbstn, with gmax: compute the bare per-group max first (GForce-optimized),
# then subtract one day in a separate vectorized step.
system.time({
dt[, maxtime := max(time), by = group][, cutoff_time := maxtime - 24*60*60]
# BUG in the original: it filtered `time >= maxtime`, which keeps only each
# group's single maximum row. The filter must use the 24-hour cutoff.
dt[time >= cutoff_time]
})
# user system elapsed
# 4.98 0.01 4.99
# gmax and roll: GForce-optimized per-group max, then a rolling join that
# keeps only rows at or after each group's cutoff.
system.time({
thresh_dt = dt[, .(time = max(time)), by=group][, time := time - 24*60*60]
thresh_dt[dt, on=c("group", "time"), roll=TRUE, nomatch=0][, list(group, time)]
})
# user system elapsed
# 1.29 0.06 1.36
# (Caveat: I didn't verify that these results match.)
我的回答对行做了两次分组(一次计算最大值,一次与原始数据连接)。Clayton Stanley 的回答把它压缩成一次分组操作,因此也变得很快(至少我认为这就是它的做法):
# Clayton Stanley's approach in one chain: sort newest-first within group,
# tag each group's first (most recent) row, broadcast its time to the whole
# group via a join, then filter to the trailing 24-hour window.
system.time(dt[order(group, -time)
][, groupP := shift(group, type='lag')
][, head := is.na(groupP) | group != groupP
# Use TRUE, not T: T is an ordinary variable and can be reassigned.
][, copy(.SD)[.SD[head == TRUE], rTime := i.time, on=c(group='group')]
][time > (rTime - 24*60*60)
][, .(group, time)
][order(group, -time)
])
# user system elapsed
# 1.32 0.25 1.14
答案 1 :(得分:4)
为每个组创建截止时间的简单解决方案(假设时间已经转换):
# Per-group cutoff: the group's latest time minus 24 hours, added as a
# column by reference.
dt[, cutoff_time := max(time) - 24*60*60, by = group]
# Keep only rows strictly within 24 hours of the group's latest time.
dt[time > cutoff_time]
修改:
评论中提到的 "GForce 优化的 max" 让我好奇,所以我构造了一些更大的假数据来比较速度。注意 integer 与 max 和 >= 配合得很好:
# Use library(), not require(): library() errors immediately if a package is
# missing, while require() only warns and returns FALSE, deferring the
# failure to the first use of a missing function.
library(data.table)
library(microbenchmark)
# N rows over N_g groups, sampled from a one-minute grid spanning six months.
N = 100000
N_g = 100
all_times = seq(from = as.POSIXct('2016-01-01 10:00:00'),
to = as.POSIXct('2016-06-30 10:00:00'),
by = 60)
# Integer copy of the grid, to compare integer vs POSIXct comparisons.
all_times_int = as.integer(all_times)
idx = sample(seq.int(length(all_times)), N, replace = TRUE)
dt = data.table(group = sample(seq.int(N_g), N, replace = TRUE),
time = all_times[idx],
time_int = all_times_int[idx])
# Baseline: per-group cutoff computed on the POSIXct column, then filter.
# NOTE: adds/overwrites a cutoff_time column in x by reference.
f1a = function (x) {
  one_day <- 24 * 60 * 60
  x[, cutoff_time := max(time) - one_day, by = group]
  x[time >= cutoff_time, .(group, time)]
}
# Same as f1a, but the comparison runs on the integer column, which is
# considerably faster than comparing POSIXct values.
# NOTE: adds/overwrites a cutoff_time column in x by reference.
f1b = function (x) {
  one_day <- 24 * 60 * 60
  x[, cutoff_time := max(time_int) - one_day, by = group]
  x[time_int >= cutoff_time, .(group, time)]
}
# GForce-friendly variant: compute the bare per-group max (optimized),
# shift it back one day, then use a rolling join to keep rows at or after
# each group's cutoff.
f2 = function (x) {
  cutoffs = x[, .(time = max(time)), by = group]
  cutoffs[, time := time - 86400]
  cutoffs[x, on = c("group", "time"), roll = TRUE, nomatch = 0][, .(group, time)]
}
# Time the three approaches (microbenchmark default: 100 runs each).
microbenchmark(f1a(dt),
f1b(dt),
f2(dt))
Unit: milliseconds
expr min lq mean median uq max neval
f1a(dt) 9.842106 10.593243 11.593148 11.62311 12.478853 14.335338 100
f1b(dt) 3.391178 3.763598 4.403264 4.00142 5.018182 8.335717 100
f2(dt) 14.422669 15.701397 17.090674 16.56990 17.695653 52.926897 100
# Sanity check: all three approaches return the same rows.
identical(f1a(dt), f1b(dt)) # TRUE
identical(f1a(dt), f2(dt)) # TRUE
编辑2:
还有一个N = 1,000,000
和N_g = 10,000
组:
> microbenchmark(f1a(dt),
+ f1b(dt),
+ f2(dt),
+ times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f1a(dt) 634.91473 647.5662 670.74597 663.28238 694.29595 728.2481 10
f1b(dt) 64.61488 67.3692 76.68925 68.42335 72.36862 113.1407 10
f2(dt) 205.67688 208.6491 229.65610 213.59476 249.16703 278.7713 10
> microbenchmark(f1a(dt),
+ f1b(dt),
+ f2(dt),
+ times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f1a(dt) 620.11090 624.33587 645.0220 642.13648 657.74347 697.27674 10
f1b(dt) 64.80214 67.43851 67.9140 67.99647 68.63552 69.74466 10
f2(dt) 198.39200 199.56088 209.6908 204.60183 216.23255 241.76792 10
> microbenchmark(f1a(dt),
+ f1b(dt),
+ f2(dt),
+ times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f1a(dt) 619.2903 645.22617 656.58883 660.99508 664.82678 682.7618 10
f1b(dt) 63.2454 67.31781 72.10255 68.19679 71.91441 106.7493 10
f2(dt) 195.9335 210.06171 222.19868 215.75979 241.74100 245.9022 10
答案 2 :(得分:3)
可能的瓶颈在于对每个组反复计算 max(*)。如果是这样的话:
<!-- NOTE(review): this HTML snippet is unrelated to the R answer above;
     it appears to be a mis-pasted code block from a different question
     (the answer's actual R code seems to be missing). Verify against the
     original Stack Overflow post. -->
<div id="container">
<div class="header">THIS IS THE HEADER</div>
<div class="section"></div>
<div class="section"></div>
<div class="section"></div>
</div>