我在RStudio中加载了一个数据框,其中包含有关众多事件的信息(数百万)。
每一行都是单个事件的一个条目,除了其他信息外,它还包含两个带有日期信息的属性。第一个包含事件开始的日期和结束时的第二个日期。但事件不是连续的,所以它们可能会在时间上重叠。
fecha fecha_fin
7510607 2014-02-13 20:09:59.8270000 2014-02-27 09:55:40.9700000
7510608 2014-02-13 20:10:01.1870000 2014-02-27 09:55:42.5630000
7557931 2014-02-16 05:32:08.6230000 2014-02-16 14:03:19.4970000
找出哪些日历日没有活动(没有任何正在处理的事件)的最佳和最有效的选项是什么?请记住,必须考虑事件的持续时间。
答案 0 :(得分:3)
对于此类情况,我倾向于使用foverlaps
包中的data.table
,例如:
library(data.table)
dt <- fread("id,fecha,fecha_fin
7510607,2014-02-01 20:09:59.8270000,2014-02-10 09:55:40.9700000
7510607,2014-02-13 20:09:59.8270000,2014-02-27 09:55:40.9700000
7510608,2014-02-13 20:10:01.1870000,2014-02-27 09:55:42.5630000
7557931,2014-02-16 05:32:08.6230000,2014-02-16 14:03:19.4970000")
setkey(dt, fecha, fecha_fin)
set(dt, j = 1L, value = NULL)
dt <- dt[,lapply(.SD, as.POSIXct, tz = "CET"),.SDcols=1:2]
dt2 <- data.table(fecha=as.POSIXct(seq(min(as.Date(dt$fecha)), max(as.Date(dt$fecha_fin)), "1 day")))[,fecha_fin:=fecha+60*60*24-1]
as.Date(foverlaps(dt2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])
# [1] "2014-02-11" "2014-02-12"
答案 1 :(得分:1)
更新,稍微修改一下lukeA的代码:
我希望我的基准测试没有任何问题...
library(data.table)
library(lubridate)
library(microbenchmark)
# Create dt ---------------------------------------------------------------
size = 99999
# With this size result is an empty set, check smaller sizes like 999 to confirm
# results are same for both functions
create_dt <- function() {
set.seed(2016)
dt <- data.table(
ID = 1:size,
fecha = sample(
seq(ymd('2000/01/01'), ymd('2016/11/16'), by="day"),
size, replace = TRUE)
)
dt[, fecha_fin := fecha + sample(1:3, size, replace = TRUE)]
setkey(dt, fecha, fecha_fin)
set(dt, j = 1L, value = NULL)
dt <- dt[,lapply(.SD, as.POSIXct, tz = "CET"),.SDcols=1:2]
}
dt <- create_dt()
# Declare functions -------------------------------------------------------
f_mdz <- function() {
dt_2 <- data.table(
fecha = seq(min(dt$fecha), max(dt$fecha_fin), by = '1 day')
# Function simplified here!!!
)[, fecha_fin := fecha]
# ---------------------------
as.Date(
foverlaps(dt_2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])#,
# origin = '1970-01-01')
}
f_lukeA <- function() {
dt2 <- data.table(
fecha = seq(min(dt$fecha), max(dt$fecha_fin), "1 day")
)[,fecha_fin:=fecha+60*60*24-1]
as.Date(
foverlaps(dt2, dt)[is.na(fecha) & is.na(fecha_fin),i.fecha])
}
# Benchmark! --------------------------------------------------------------
microbenchmark(
dt_mdz <- f_mdz(),
dt_lukeA <- f_lukeA(),
times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dt_mdz <- f_mdz() 46.96793 55.11631 95.59214 60.33659 191.5536 212.4523 100 a
# dt_lukeA <- f_lukeA() 50.57496 56.42464 105.07356 60.81974 194.0779 211.8037 100 a
identical(dt_mdz, dt_lukeA)
旧答案:
进一步调查的出发点(远非效率,例如data.table
上的逐行操作......)可能是:
library(data.table)
library(lubridate)
library(magrittr)
dt <- data.table(
ID = c(7510607L, 7510608L, 7557931L),
fecha = ymd(c('2014-02-15', '2014-02-16', '2014-02-11')),
fecha_fin = ymd(c('2014-02-27', '2014-02-27', '2014-02-12'))
)
# ID fecha fecha_fin
# 1: 7510607 2014-02-15 2014-02-27
# 2: 7510608 2014-02-16 2014-02-27
# 3: 7557931 2014-02-11 2014-02-12
# Make the data "long"
long_dt <- dt[, .(days = seq(fecha, fecha_fin, by = '1 day')), by = ID]
# Get the diff with days sequence from min to max date
setdiff(
seq(long_dt[, min(days)], long_dt[, max(days)], by = '1 day'),
long_dt[, sort(unique(days))]
) %>% as.Date(origin = '1970-01-01')
# [1] "2014-02-13" "2014-02-14"
请注意我已将您的数据更改为实际有两天(2014-02-13和2014-02-14),没有任何活动。
答案 2 :(得分:0)
基础R解决方案是:
df$fecha <- strptime(df$fecha, "%Y-%m-%d")
df$fecha_fin <- strptime(df$fecha_fin, "%Y-%m-%d")
dates_list <- lapply(1:3, function(x){
interval_events <- seq(from = df$fecha[x], to = df$fecha_fin[x], by = "days")
})
interval_events <- unique(do.call("c", dates_list))
interval_complete <- seq(from = min(df$fecha), max(df$fecha_fin), by = "days")
interval_complete[!(interval_complete %in% interval_events)]
#[1] "2014-02-13 CET" "2014-02-14 CET"
答案 3 :(得分:0)
这是一个简单的!您只需展开日期并结合所有日期。
## Data
dt1=as.Date(c('2014/01/01','2014/01/08','2014/01/05'))
dt2=as.Date(c('2014/01/10','2014/01/14','2014/01/05'))
df=data.frame(id=sample(1:3), dt1=dt1, dt2=dt2)
## Code
date=apply(df, 1, function(x) seq(as.Date(x[2]), as.Date(x[3]), by="day"))
event_dates=as.Date(Reduce(union, date), origin = "1970-01-01")