我有一个包含项目的数据框,并且每个项目都有可用的开始日期和结束日期。我想知道在一段时间内每天有多少项是活跃的。
示例数据集:
# Sample data: three items, each with the first and last day it is active.
# ymd() comes from lubridate, so the package has to be attached first.
library(lubridate)

ItemId <- c(1, 2, 3)
StartDate <- c(ymd("2014-01-01"), ymd("2014-02-01"), ymd("2014-03-01"))
EndDate <- c(ymd("2014-02-15"), ymd("2014-02-07"), ymd("2014-03-03"))
# Left unassigned so that the data frame is printed.
data.frame(ItemId, StartDate, EndDate)
ItemId StartDate EndDate
1 1 2014-01-01 01:00:00 2014-02-15 01:00:00
2 2 2014-02-01 01:00:00 2014-02-07 01:00:00
3 3 2014-03-01 01:00:00 2014-03-03 01:00:00
结果看起来像这样(每天一个条目):
Date ActiveCount
2014-01-01 1
2014-01-02 1
...
2014-02-01 2
...
我有一个使用 sqldf 的解决方案,但不知道如何直接用 R(不借助 SQL)来实现。
-- For every distinct start or end date found in items, count the items
-- whose [startdate, enddate] range contains that date.
select d.date
     , ( select count(ItemID)
           from items
          where startdate <= d.date
            and enddate >= d.date
       ) activecount
  from ( -- Both branches are aliased as "date" so that d.date resolves.
         -- union already removes duplicate rows, so distinct is not needed.
         select startdate as date from items
         union
         select enddate as date from items
       ) d
 order by 1
(我的数据中每天都有多个条目,所以这种只取已有开始/结束日期的做法在 R 的 sqlite 中是可行的。在 postgresql 上,我可以直接生成连续的日期序列,效果更好。)
感谢。
答案 0 :(得分:5)
假设您的数据框名为 df:
# One entry per calendar day between the earliest start and the latest end.
dates <- seq(min(df$StartDate), max(df$EndDate), by = "day")
# An item is active on a day when StartDate <= day <= EndDate. vapply() pins
# the result to exactly one integer per day, which sapply() does not guarantee.
counts <- data.frame(
  date = dates,
  count = vapply(
    dates,
    function(day) sum(day >= df$StartDate & day <= df$EndDate),
    integer(1)
  )
)
答案 1 :(得分:3)
每当 R 任务类似于 SQL 任务时,就该把 dplyr 拿出来用了:
library(dplyr)
library(lubridate)  # provides ymd()

ItemId <- c(1, 2, 3)
StartDate <- c(ymd("2014-01-01"), ymd("2014-02-01"), ymd("2014-03-01"))
EndDate <- c(ymd("2014-02-15"), ymd("2014-02-07"), ymd("2014-03-03"))
jim <- data.frame(ItemId, StartDate, EndDate)

# A technique that is often useful in R: materialise the thing you would
# iterate over (here, every day in the overall range) as a column, then
# express the computation as vectorised table operations.
ed <- data.frame(rng = seq(min(jim$StartDate), max(jim$EndDate), by = "day"))

# jim and ed share no column names, so merge() pairs every item with every
# day; by = NULL states that cross join explicitly.
merge(jim, ed, by = NULL) %>%
  # keep only the days that fall inside the item's active range
  filter(rng >= StartDate, rng <= EndDate) %>%
  group_by(rng) %>%
  # number of active items per day
  summarise(n())
这会给你:
rng n()
1 2014-01-01 1
2 2014-01-02 1
3 2014-01-03 1
...
答案 2 :(得分:0)
首先需要得到至少有一个活跃项目的所有日期,然后统计每天活跃项目的数量。假设数据存储在 itemDates 中,下面的代码即可完成:
# Every calendar day from the earliest start to the latest end. seq() steps
# by whole days, so the result does not depend on the units in which a
# difference between the two dates would be reported.
dates <- seq(min(itemDates$StartDate), max(itemDates$EndDate), by = "day")
# Row names carry the day; counts holds how many items are active on it.
dateCounts <- data.frame(
  row.names = dates,
  counts = vapply(
    dates,
    function(date) sum(date >= itemDates$StartDate & date <= itemDates$EndDate),
    integer(1)
  )
)
答案 3 :(得分:0)
我已经多次回到这个问题,并且一直在寻找最有效的方法。
我以前使用过 map-reduce 方法,但注意到它不能很好地扩展到日期跨度较宽的大型数据框。我刚刚尝试使用 lubridate 包中的 interval 类,发现它是迄今为止最快的实现。
这是最终代码:
library(tidyverse)
library(lubridate)

# Simulate N objects. Each one starts on a day drawn from 2018-01-01 to
# 2019-01-01 and stays active for a further 7 to 100 days; `interval` stores
# that active span as a lubridate interval.
N <- 1000
id_dates <- tibble(
  id = seq_len(N),
  start = sample(
    seq(as.Date('2018-1-1'), as.Date('2019-1-1'), by = "day"),
    size = N,
    replace = TRUE
  ),
  end = start + sample(7:100, size = N, replace = TRUE),
  interval = interval(start, end)
)

# For each day between the earliest start and the latest end, count the
# intervals that contain it. rowwise() makes `Date` a single day inside
# mutate(), so %within% tests that one day against every interval.
queue_history <-
  tibble(Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day")) %>%
  rowwise() %>%
  mutate(numInWIP = sum(Date %within% id_dates$interval)) %>%
  ungroup()
下面的基准测试表明,lubridate 解决方案远比当前的最佳答案和 map-reduce 方法快:
library(tidyverse)
library(lubridate)

# Benchmark fixture: N objects, each with a start day drawn from 2018-01-01
# to 2019-01-01, an end day 7 to 100 days later, and the matching lubridate
# interval.
N <- 1000
id_dates <- tibble(
  id = seq_len(N),
  start = sample(
    seq(as.Date('2018-1-1'), as.Date('2019-1-1'), by = "day"),
    size = N,
    replace = TRUE
  ),
  end = start + sample(7:100, size = N, replace = TRUE),
  interval = interval(start, end)
)
# Map-reduce approach: expand every item into the days it covers, pool the
# days, and tabulate how often each day occurs.
#
# Reads `id_dates` (columns `start`, `end`) from the enclosing environment.
# Returns a tibble with one row per covered day: `Date` and `numInWIP`.
method_mapreduce <- function() {
  # map: one day sequence per item; reduce: concatenate them into one vector
  covered_days <- reduce(map2(id_dates$start, id_dates$end, seq, by = 1), c)
  day_counts <- table(covered_days)
  # Build the result from the table's names and counts directly instead of
  # relying on the column names a table-to-tibble conversion happens to pick.
  tibble(
    Date = as_date(names(day_counts)),
    numInWIP = as.integer(day_counts)
  )
}
# lubridate interval approach: for each day in the covered range, count the
# item intervals that contain it.
#
# Reads `id_dates` (columns `start`, `end`, `interval`) from the enclosing
# environment. Returns a tibble with columns `Date` and `numInWIP`.
method_intervals <- function() {
  tibble(Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day")) %>%
    # rowwise() makes `Date` a single day inside mutate()
    rowwise() %>%
    mutate(numInWIP = sum(Date %within% id_dates$interval)) %>%
    # drop the rowwise grouping so the caller gets a plain tibble
    ungroup()
}
# Cross-join approach: pair every item with every day in the covered range,
# keep the pairs where the day lies inside the item's active span, and count
# the remaining pairs per day.
#
# Reads `id_dates` (columns `start`, `end`) from the enclosing environment.
# Returns a tibble with columns `Date` and `numInWIP`.
method_currentsolution <- function() {
  date_df <- tibble(
    Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day")
  )
  # The two tables share no column names; by = NULL makes the cross join
  # explicit.
  merge(id_dates, date_df, by = NULL) %>%
    filter(Date >= start, Date <= end) %>%
    group_by(Date) %>%
    summarise(numInWIP = n())
}
# Time each approach five times.
tst <- microbenchmark::microbenchmark(
  method_mapreduce(),
  method_intervals(),
  method_currentsolution(),
  times = 5
)

# Plot the timings on a log10 y axis. The axis title uses the unit that
# summary() reports for these timings; breaks sit at rounded powers of ten.
microbenchmark::autoplot.microbenchmark(tst) +
  scale_y_log10(
    name = sprintf("Time [%s]", attr(summary(tst), "unit")),
    breaks = scales::trans_breaks("log10", function(x) round(10^x))
  )