这是员工开始和结束日期的列表。考虑到可能有数万名员工,我们如何有效地计算每月员工人数。
begin <- as.Date(c("2015-07-06","2015-07-06","2015-07-27","2015-07-06","2015-06-29",
"2015-07-06","2015-07-06","2015-07-06","2015-07-13","2015-06-29",
"2015-07-06","2015-07-06","2015-07-13","2015-07-01","2015-07-06",
"2015-07-06","2015-07-06","2015-07-09","2015-07-13"),format = "%Y-%m-%d")
end <- as.Date(c("2018-08-03","2016-01-11","2999-12-31","2017-03-13","2999-12-31",
"2015-10-20","2999-12-31","2018-09-24","2999-12-31","2015-09-25",
"2019-11-01","2999-12-31","2018-03-26","2018-08-08","2015-10-13",
"2999-12-31","2999-12-31","2021-02-11","2999-12-31"), format = "%Y-%m-%d")
smallEmp <- data.frame(begin,end)
method_currentsolution = function() {
date_df = tibble(Date = seq(from=as.Date('2015-01-01'), to=Sys.Date(), by = "month"))
queue_history = merge(smallEmp, date_df, all=TRUE) %>%
filter(Date >= begin, Date <= end) %>%
group_by(Date) %>%
summarise(cnt = n())
}
result <- method_currentsolution()
smallEmp
是包含 Begin
和 End
日期的数据集。
此解决方案有效,但对于非常大的数据集来说速度很慢,因为它会针对每个感兴趣的月份重复所有员工。任何可以加快速度的建议将不胜感激。
答案 0 :(得分:3)
基本的 R 解决方案是
date_df <- data.frame(
Date = seq(from=as.Date('2015-01-01'), to=Sys.Date(), by = "month"))
date_df$count <- sapply(date_df$Date, function(x) sum(x >= begin & x <= end))
date_df <- subset(date_df, count > 0)
head(date_df, 10)
#R> Date count
#R> 7 2015-07-01 3
#R> 8 2015-08-01 19
#R> 9 2015-09-01 19
#R> 10 2015-10-01 18
#R> 11 2015-11-01 16
#R> 12 2015-12-01 16
#R> 13 2016-01-01 16
#R> 14 2016-02-01 15
#R> 15 2016-03-01 15
#R> 16 2016-04-01 15
这可能更快,但它使用更多的峰值内存
date_df_2 <- data.frame(
Date = seq(from=as.Date('2015-01-01'), to=Sys.Date(), by = "month"))
date_df_2$count <-
rowSums(outer(date_df_2$Date, begin, `>=`) & outer(date_df_2$Date, end, `<=`))
date_df_2 <- subset(date_df_2, count > 0)
# we got the same
all.equal(date_df_2, date_df)
#R> [1] TRUE
在 R 4.1.0 或更高版本中可以更好地编写两个解决方案,如下所示
date_df_3 <- data.frame(
Date = seq(as.Date('2015-01-01'), Sys.Date(), by = "month")) |>
transform(count = sapply(Date, \(x) sum(x >= begin & x <= end))) |>
subset(count > 0)
date_df_4 <- data.frame(
Date = seq(as.Date('2015-01-01'), Sys.Date(), by = "month")) |>
transform(
count = rowSums(outer(Date, begin, `>=`) & outer(Date, end, `<=`))) |>
subset(count > 0)
# we got the same
all.equal(date_df_3, date_df)
#R> [1] TRUE
all.equal(date_df_4, date_df)
#R> [1] TRUE
您可能希望将 >=
和 <=
之一更改为 >
或 <
。
这是一个简单的基准测试,它是通过重复 OP 提供的数据创建的
# repeat the data a number of times to create a larger data set
begin <- rep(begin, 1000)
end <- rep(end, 1000)
smallEmp <- data.frame(begin,end)
small_emp <- data.table(begin = begin, end = end)
# perform the benchmark
bench::mark(
OP = method_currentsolution(),
sapply = data.frame(
Date = seq(as.Date('2015-01-01'), Sys.Date(), by = "month")) |>
transform(count = sapply(Date, \(x) sum(x >= begin & x <= end))) |>
subset(count > 0),
rowSums = data.frame(
Date = seq(as.Date('2015-01-01'), Sys.Date(), by = "month")) |>
transform(
count = rowSums(outer(Date, begin, `>=`) & outer(Date, end, `<=`))) |>
subset(count > 0),
data.table = {
all_dates <- data.table(
date = seq(from = as.Date('2015-01-01'), to = Sys.Date(), by = "month"))
all_dates[small_emp, .N, on = .(date >= begin, date <= end), keyby = .(date)]
},
`data.table (by)` = {
all_dates <- data.table(
date = seq(from = as.Date('2015-01-01'), to = Sys.Date(), by = "month"))
all_dates[small_emp, .N, on = .(date >= begin, date <= end), by = .(date)]
},
fuzzy_inner_join = method_fuzzy(smallEmp), check = FALSE, memory = FALSE)
#R> # A tibble: 6 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#R> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
#R> 1 OP 2.87s 2.87s 0.349 NA 1.74 1 5 2.87s <NULL> <NULL> <bench_tm [1]> <tibble [1 × 3]>
#R> 2 sapply 8.91ms 9.42ms 55.9 NA 7.71 29 4 518.53ms <NULL> <NULL> <bench_tm [29]> <tibble [29 × 3]>
#R> 3 rowSums 28.26ms 56.98ms 15.9 NA 24.7 9 14 566.25ms <NULL> <NULL> <bench_tm [9]> <tibble [9 × 3]>
#R> 4 data.table 23.34ms 23.89ms 25.8 NA 7.95 13 4 503.36ms <NULL> <NULL> <bench_tm [13]> <tibble [13 × 3]>
#R> 5 data.table (by) 23.22ms 28.4ms 33.9 NA 3.99 17 2 500.82ms <NULL> <NULL> <bench_tm [17]> <tibble [17 × 3]>
#R> 6 fuzzy_inner_join 16.09s 16.09s 0.0622 NA 8.21 1 132 16.09s <NULL> <NULL> <bench_tm [1]> <tibble [1 × 3]>
sapply
解决方案是最快的。它比 data.table
版本的秒快约快 2 倍。对于其他大小的数据(比如数百万个数据点,而不是 19000 个),情况可能会有所不同。
答案 1 :(得分:2)
你可以试试 fuzzjoin
-
library(dplyr)
method_fuzzy = function(df) {
date_df = tibble(Date = seq(from=as.Date('2015-01-01'),
to=Sys.Date(), by = "month"))
fuzzyjoin::fuzzy_inner_join(date_df, df,
by = c('Date' = 'begin', 'Date' = 'end'),
match_fun = c(`>=`, `<=`)) %>%
count(Date)
}
method_fuzzy(smallEmp)
答案 2 :(得分:2)
您可以使用 data.table
非常有效地执行相同的操作,使用 "non-equi join":
small_emp <- data.table(
begin = as.Date(
c(
"2015-07-06","2015-07-06","2015-07-27","2015-07-06","2015-06-29",
"2015-07-06","2015-07-06","2015-07-06","2015-07-13","2015-06-29",
"2015-07-06","2015-07-06","2015-07-13","2015-07-01","2015-07-06",
"2015-07-06","2015-07-06","2015-07-09","2015-07-13"
),
format = "%Y-%m-%d"
),
end = as.Date(
c(
"2018-08-03","2016-01-11","2999-12-31","2017-03-13","2999-12-31",
"2015-10-20","2999-12-31","2018-09-24","2999-12-31","2015-09-25",
"2019-11-01","2999-12-31","2018-03-26","2018-08-08","2015-10-13",
"2999-12-31","2999-12-31","2021-02-11","2999-12-31"
),
format = "%Y-%m-%d"
)
)
all_dates <- data.table(
date = seq(from = as.Date('2015-01-01'), to = Sys.Date(), by = "month")
)
result2 <- all_dates[
small_emp, .N, on = .(date >= begin, date <= end), keyby = .(date)
]
请注意,非对等连接的 on
规范有点棘手。它不是一个任意的 R 表达式!它是形式为 lhs_column %binop% rhs_column
的“操作”列表。具体来说,date >= begin
是有效的,因为 date
是左侧表 all_dates
中的一列,而 begin
是右侧表 small_emp
中的一列。
此外,on
中指定的列的名称将被丢弃,在这种情况下替换为 date.1
。我实际上不太了解这里的规则,而且从我所看到的情况来看,它没有得到很好的记录。也许有人可以在评论中解释这是如何工作的!
答案 3 :(得分:2)
这是一个使用 foverlaps()
的 data.table 方法。请注意,如果您将 2999 年减少到另一个(更接近)令人难以置信的数字,例如 2099,则此解决方案的执行速度会快得多。
library(data.table)
DT <- data.table(start = begin, end = end)
answer <- data.table(date = seq(min(DT$start), max(DT$end), by = "1 days"))
answer[, dummy := date]
#set keys
setkey(DT, start, end)
setkey(answer, date, dummy)
#overlap join ans summarise
final <- foverlaps(DT, answer)[, .N, by = date]
# 1: 2015-06-29 2
# 2: 2015-06-30 2
# 3: 2015-07-01 3
# 4: 2015-07-02 3
# 5: 2015-07-03 3
# ---
# 359581: 2999-12-27 8
# 359582: 2999-12-28 8
# 359583: 2999-12-29 8
# 359584: 2999-12-30 8
# 359585: 2999-12-31 8
#create nicer looking intervals
final[, .(from = min(date), to = max(date), N = min(N)), by = (id = rleid(N))]
# id from to N
# 1: 1 2015-06-29 2015-06-30 2
# 2: 2 2015-07-01 2015-07-05 3
# 3: 3 2015-07-06 2015-07-08 14
# 4: 4 2015-07-09 2015-07-12 15
# 5: 5 2015-07-13 2015-07-26 18
# 6: 6 2015-07-27 2015-09-25 19
# 7: 7 2015-09-26 2015-10-13 18
# 8: 8 2015-10-14 2015-10-20 17
# 9: 9 2015-10-21 2016-01-11 16
#10: 10 2016-01-12 2017-03-13 15
#11: 11 2017-03-14 2018-03-26 14
#12: 12 2018-03-27 2018-08-03 13
#13: 13 2018-08-04 2018-08-08 12
#14: 14 2018-08-09 2018-09-24 11
#15: 15 2018-09-25 2019-11-01 10
#16: 16 2019-11-02 2021-02-11 9
#17: 17 2021-02-12 2999-12-31 8