我有一个包含250,000行的data.table。一小部分样本如下所示:
library('data.table')
stack <- structure(list(ID = c(1, 2, 3, 4,
5, 6),
Special = c(FALSE, FALSE, TRUE, FALSE,
FALSE, FALSE),
ENTER = structure(c(1453797143, 1453850662, 1453444763, 1453656563, 1453410022, 1453367723),
class = c("POSIXct", "POSIXt"), tzone = ""),
LEAVE = structure(c(1453803923, 1453856002, 1453450403,1453657823, 1453418123, 1453377382),
class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("ID", "Special", "ENTER", "LEAVE"),
class = c("data.table", "data.frame"),
row.names = c(NA, -6L))
stack
ID Special ENTER LEAVE
1: 1 FALSE 2016-01-26 09:32:23 2016-01-26 11:25:23
2: 2 FALSE 2016-01-27 00:24:22 2016-01-27 01:53:22
3: 3 TRUE 2016-01-22 07:39:23 2016-01-22 09:13:23
4: 4 FALSE 2016-01-24 18:29:23 2016-01-24 18:50:23
5: 5 FALSE 2016-01-21 22:00:22 2016-01-22 00:15:23
6: 6 FALSE 2016-01-21 10:15:23 2016-01-21 12:56:22
我正在以每小时的间隔寻找WIP(正在进行的工作)。这是已输入但尚未在特定时间离开的实例数的计数。我现在使用以下功能执行此操作:
get_WIPlevels <- function(data) {
# determine range
min <- round(min(data$ENTER), "hours")
max <- round(max(data$LEAVE), "hours")
range <- seq(from=min, to=max, by='hours')
# create target data.table
WIPLevels <- data.table(Timestamp = range, WIP = integer(length(range)))
# calculate WIP values
WIPLevels$WIP <- sapply(WIPLevels$Timestamp, function(x)
nrow(data[data$ENTER<=x & data$LEAVE > x,]))
return(WIPLevels)
}
这可以按预期工作,结果如下:
get_WIPLevels(stack)
Timestamp WIP
1: 2016-01-21 10:00:00 0
2: 2016-01-21 11:00:00 1
3: 2016-01-21 12:00:00 1
4: 2016-01-21 13:00:00 0
5: 2016-01-21 14:00:00 0
---
137: 2016-01-27 02:00:00 0
但是,它不是最快的功能。将它应用于完整数据集只需要一分钟:
system.time(get_WIPlevels(fulldata))
user system elapsed
53.72 2.15 56.19
有什么建议可以加快速度吗?
答案 0 :(得分:1)
您可以使用滚动连接尝试此操作
tr <- data.table(
Timestamp = seq(lubridate::floor_date(min(stack$ENTER), "hour"),
lubridate::ceiling_date(max(stack$LEAVE), "hour"), by = "hours"))
setkey(tr, Timestamp)
stack[, Timestamp := ENTER]
setkey(stack, Timestamp)
ms <- stack[tr, roll = +Inf]
ms[, .(WIP = sum(!is.na(ENTER) & LEAVE >= Timestamp)), by = Timestamp]