I have a data frame with two columns - one is a start date and the other is an end date. I need to get a new data frame with two columns - a Date column and a column holding the count of observations from the first data frame where the date falls between Start and End.
Have:
dates <- data.frame("Start" = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    "End"   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))
Start End
2017-01-01 2017-01-03
2017-01-02 2017-01-04
2017-01-03 2017-01-05
2017-01-04 2017-01-06
2017-01-05 2017-01-07
Want:
Date Count
2017-01-01 1
2017-01-02 2
2017-01-03 3
2017-01-04 3
2017-01-05 3
2017-01-06 2
2017-01-07 1
I could do this with a for loop, but is there a better way in R?
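For reference, a minimal sketch of the loop I have in mind (a naive tally over every day in the overall range):

all_dates <- seq(min(dates$Start), max(dates$End), by = "day")
counts <- integer(length(all_dates))
for (i in seq_len(nrow(dates))) {
  in_range <- all_dates >= dates$Start[i] & all_dates <= dates$End[i]
  counts[in_range] <- counts[in_range] + 1
}
data.frame(Date = all_dates, Count = counts)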
Answer 0 (score: 3)
One possibility using data.table:
library(data.table)
setDT(dates)[, .(Date = seq(Start, End, by = "day")),
             by = 1:nrow(dates)][, .(count = .N), by = Date]
#> Date count
#> 1: 2017-01-01 1
#> 2: 2017-01-02 2
#> 3: 2017-01-03 3
#> 4: 2017-01-04 3
#> 5: 2017-01-05 3
#> 6: 2017-01-06 2
#> 7: 2017-01-07 1
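Here by = 1:nrow(dates) makes each input row its own group, so seq(Start, End, by = "day") is evaluated once per row; the chained second [...] then counts the expanded rows per Date with .N.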
Answer 1 (score: 2)
An option is to use map2 to get the sequence of dates between the corresponding 'Start' and 'End' columns, unnest the list output, and get the count:
library(dplyr)
library(tidyr)
library(purrr)
dates %>%
transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
unnest(Date) %>%
count(Date)
# A tibble: 7 x 2
# Date n
# <date> <int>
#1 2017-01-01 1
#2 2017-01-02 2
#3 2017-01-03 3
#4 2017-01-04 3
#5 2017-01-05 3
#6 2017-01-06 2
#7 2017-01-07 1
Or with base R (this is the akrun_base variant benchmarked below):
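# base R version of the same idea (reproduced from the akrun_base benchmark entry below)
lst1 <- do.call(Map, c(f = seq, unname(dates), by = "1 day"))
data.frame(table(do.call(c, lst1)))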
Answer 2 (score: 2)
Using base R, we can create a sequence between the Start and End dates with Map and use table to count the frequency of each date:
stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
# values ind
#1 1 2017-01-01
#2 2 2017-01-02
#3 3 2017-01-03
#4 3 2017-01-04
#5 3 2017-01-05
#6 2 2017-01-06
#7 1 2017-01-07
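If you want the exact Date/Count layout from the question, a small follow-up (my addition, not part of the original answer) reorders and renames the stack() output and converts the factor back to Date:

out <- stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
data.frame(Date = as.Date(as.character(out$ind)), Count = out$values)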
Answer 3 (score: 2)
These are base options that avoid grouping.
The first option uses logic to avoid Map or reshaping. Every dates$Start is counted once; dates$End is added only for the rows where it differs from Start (i.e., the same date should not be double counted); and the in-between dates are built from dates$Start plus a sequence of offsets, taken only where the difference between End and Start is not 0:
date_diffs <- as.integer(dates$End - dates$Start)
pos_diffs <- date_diffs[date_diffs > 0]
x <- c(dates[['Start']],
       with(subset(dates, subset = date_diffs > 0),
            c(End, rep(Start, pos_diffs - 1) + sequence(pos_diffs - 1))
       ))
data.frame(table(x))
The second option is to Vectorize() the seq.Date() function; then simply combine and tabulate the results:
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
Var1 Freq
1 2017-01-01 1
2 2017-01-02 2
3 2017-01-03 3
4 2017-01-04 3
5 2017-01-05 3
6 2017-01-06 2
7 2017-01-07 1
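Note that SIMPLIFY = FALSE is what keeps each per-row result as a Date vector inside a list; letting Vectorize() simplify the results into a matrix would strip the Date class before do.call(c, ...) concatenates them.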
Performance:
For @akrun's base option I wrapped the result of table() in data.frame(), so every approach produces some kind of data.frame.
# The original data set copied to make 1080 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 1629.3048 1647.52845 1680.82496 1664.07245 1697.4511 1828.4093 20
tmfmnk_lubrid 6882.1404 6959.20810 7018.69083 7002.27455 7064.0898 7276.2349 20
M_M_data.tab 103.4166 103.99925 108.33817 107.95715 108.6591 134.9388 20
Ronak_stack_base 131.0364 134.23370 137.46651 137.32235 141.4388 144.5971 20
akrun_purrr 133.4917 136.89080 138.92787 138.44575 140.7778 147.4172 20
akrun_base 130.4179 134.16735 137.21640 136.98225 140.1182 145.8873 20
cole_base 15.4396 15.60345 16.42558 16.74245 17.1322 17.4201 20
cole_vec_seq 114.7890 118.44795 130.69493 121.76430 124.0880 309.7418 20
cole_dt_version 15.9107 15.98035 16.56220 16.11790 17.0216 18.8438 20
# Original data set copied to make 180 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 275.6845 279.06390 281.871350 281.95420 284.16025 287.5412 20
tmfmnk_lubrid 1136.1109 1161.35915 1176.073630 1169.81685 1176.87130 1277.6444 20
M_M_data.tab 19.0258 19.33070 19.766890 19.45450 19.63410 24.7390 20
Ronak_stack_base 22.2327 22.56530 23.234095 22.85260 23.20790 27.0589 20
akrun_purrr 27.8797 28.50225 29.146325 28.71840 29.11915 33.3277 20
akrun_base 22.3477 22.61135 23.370780 22.81920 23.41300 28.6941 20
cole_base 3.4258 3.50735 3.642605 3.62470 3.67595 3.9780 20
cole_vec_seq 19.9366 20.08345 21.359275 20.17250 22.48055 25.7780 20
cole_dt_version 3.9992 4.09905 4.207690 4.16135 4.28265 4.5052 20
# Original dataset copied to make 30 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 51.2437 52.16495 54.524465 52.55520 56.19050 66.9461 20
tmfmnk_lubrid 192.1206 196.99550 198.501640 197.64815 201.42050 203.7031 20
M_M_data.tab 4.9511 5.05215 5.215670 5.19315 5.33075 5.7740 20
Ronak_stack_base 4.3609 4.51110 4.995405 4.54885 4.79490 8.8183 20
akrun_purrr 10.9024 10.96420 11.622235 11.07575 11.58300 15.7751 20
akrun_base 4.4919 4.55905 4.843730 4.60825 4.73760 8.4334 20
cole_base 1.4225 1.48635 1.738995 1.58685 1.60780 5.2324 20
cole_vec_seq 4.0648 4.16095 4.318665 4.24445 4.48420 4.7344 20
cole_dt_version 1.9733 2.06385 2.132040 2.13965 2.18945 2.3612 20
# Original 5 row data set
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 14.7549 14.90780 15.463195 15.10195 15.62030 18.9115 20
tmfmnk_lubrid 37.2571 37.58240 41.583090 38.18540 40.57435 86.6058 20
M_M_data.tab 2.6235 2.85145 3.037975 2.90815 2.97045 5.3476 20
Ronak_stack_base 1.3305 1.38490 1.465170 1.49175 1.53355 1.5978 20
akrun_purrr 7.7036 7.86260 8.212875 7.98790 8.18055 11.7898 20
akrun_base 1.4046 1.43715 1.501945 1.51890 1.56545 1.6176 20
cole_base 1.0560 1.09905 1.169260 1.16010 1.21595 1.3601 20
cole_vec_seq 1.3547 1.40685 1.452515 1.45645 1.51385 1.5328 20
cole_dt_version 1.5662 1.70555 1.813365 1.78930 1.84720 2.5267 20
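On the larger inputs the two logic-based options (cole_base and cole_dt_version) pull well ahead of the Map/seq-based approaches, and the gap widens as the number of rows grows; the rowwise lubridate variant is consistently the slowest. At the original 5-row size, all of the fast options are within a couple of milliseconds of one another.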
Code for reference:
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(microbenchmark)
library(lubridate)
dates <- data.frame("Start" = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    "End"   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))
dates <- rbind(dates, dates, dates, dates, dates, dates) # repeat this as many times as you want
dates_dt <- as.data.table(dates)
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
microbenchmark(
tmfmnk_complete = {
dates %>%
rowid_to_column() %>%
gather(var, Date, -rowid) %>%
group_by(rowid) %>%
complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
ungroup() %>%
count(Date)
}
, tmfmnk_lubrid = {
dates %>%
rowwise() %>%
mutate(Date = interval(Start, End)/days(1),
Date = list(Start + days(0:Date))) %>%
ungroup() %>%
unnest() %>%
count(Date)
}
, M_M_data.tab = {
dates_dt[ ,.(Date = seq(Start, End, by = "day")),
by = 1:nrow(dates_dt)][,
.(count = .N), by = Date]
}
, Ronak_stack_base = {
stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
}
, akrun_purrr = {
dates %>%
transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
unnest(Date) %>%
count(Date)
}
, akrun_base = {
lst1 <- do.call(Map, c(f = seq, unname(dates), by = "1 day"))
data.frame(table(do.call(c, lst1)))
}
, cole_base = {
    date_diffs <- as.integer(dates$End - dates$Start)
    pos_diffs <- date_diffs[date_diffs > 0]
    x <- c(dates[['Start']],
           with(subset(dates, subset = date_diffs > 0),
                c(End, rep(Start, pos_diffs - 1) + sequence(pos_diffs - 1))
           ))
    data.frame(table(x))
  }
, cole_vec_seq = {
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
}
, cole_dt_version = {
    date_diffs <- dates$End - dates$Start
    dates_dt[date_diffs > 0,
             data.frame(table({
               diff_sub <- as.integer(End - Start) - 1L
               c(dates_dt[['Start']], End, rep(Start, diff_sub) + sequence(diff_sub))
             }))]
  }
, times = 20
)
Answer 4 (score: 1)
A possibility involving dplyr and tidyr (plus tibble for rowid_to_column()) could be:
library(dplyr)
library(tidyr)
library(tibble)
dates %>%
rowid_to_column() %>%
gather(var, Date, -rowid) %>%
group_by(rowid) %>%
complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
ungroup() %>%
count(Date)
Date n
<date> <int>
1 2017-01-01 1
2 2017-01-02 2
3 2017-01-03 3
4 2017-01-04 3
5 2017-01-05 3
6 2017-01-06 2
7 2017-01-07 1
Or adding lubridate:
dates %>%
rowwise() %>%
mutate(Date = interval(Start, End)/days(1),
Date = list(Start + days(0:Date))) %>%
ungroup() %>%
unnest() %>%
count(Date)
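Note: unnest() with no arguments relies on older tidyr behavior; in tidyr >= 1.0.0 the columns to unnest must be specified explicitly, i.e. unnest(Date) here, to avoid a deprecation warning.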