dplyr折叠时间段

时间:2017-10-25 19:02:03

标签: r dplyr lubridate data-cleaning data-munging

我有如下的 data.frame,希望对其中的时间段进行分块(chunk):对每个 company_id,把彼此间隔不超过 30 天的时间段“折叠”合并为一个时间段。

# Attempt: number the periods of each company into blocks ("week") and
# collapse every block to its earliest start and latest end.
# NOTE(review): the gap is measured only against the immediately preceding
# row's endDate, so an earlier, longer period that is still running is not
# taken into account when deciding whether a new block starts.
test <- blocks %>%
  filter(company_id %in% c(209952, 2802315)) %>%
  arrange(company_id, startDate) %>%
  group_by(company_id) %>%
  mutate(
    # Time elapsed since the previous row's end (default 0 for the first row).
    gap = startDate - lag(endDate, default = 0),
    # A gap of more than 30 opens a new block; cumsum turns the flags into ids.
    week = cumsum(gap > 30)
  ) %>%
  group_by(company_id, week) %>%
  summarize(startDate = min(startDate), endDate = max(endDate))

我尝试了上面的代码,得到如下结果:

  company_id  week  startDate    endDate
       <dbl> <int>     <date>     <date>
1     209952     1 2012-09-17 2016-05-07
2     209952     2 2016-05-07 2017-10-23
3    2802315     1 2012-10-19 2014-05-18
4    2802315     2 2014-09-29 2014-11-29
5    2802315     3 2015-04-22 2015-09-23
6    2802315     4 2015-11-23 2016-05-23

问题在于第 (1) 行和第 (2) 行应当合并为一行(startDate = 2012-09-17,endDate = 2017-10-23),因为它们之间的间隔不到 30 天。

1     209952     1 2012-09-17 2017-10-23
2    2802315     1 2012-10-19 2014-05-18
3    2802315     2 2014-09-29 2014-11-29
4    2802315     3 2015-04-22 2015-09-23
5    2802315     4 2015-11-23 2016-05-23

上面的表格就是我期望得到的输出。

1 个答案:

答案 0 :(得分:2)

不妨连续调用两次 mutate + summarize:

# Collapse time periods into merged blocks separated by more than 30 days.
#
# DF: data frame with columns company_id, startDate and endDate (Date).
#     It is expected to be grouped by company_id and sorted by startDate
#     within each company; the function itself neither sorts nor groups
#     before computing the block ids.
#
# Returns one row per merged block with columns company_id, week (block index
# within the company, starting at 1), startDate (earliest start in the block)
# and endDate (latest end in the block). summarize() peels off the `week`
# grouping level, so the result is still grouped by company_id and can be
# passed to chunk() again without changing it.
chunk <- function(DF) {
  DF %>%
    mutate(
      # Latest end date reached by ANY earlier row of the group, not just the
      # previous row: a long period may still be running after several short
      # ones have ended. cummax() is not defined for Date objects, hence the
      # numeric day counts. -Inf makes the first row always open a block.
      prev_end = lag(cummax(as.numeric(endDate)), default = -Inf),
      # A gap of more than 30 days opens a new block; cumsum numbers them.
      week = cumsum(as.numeric(startDate) - prev_end > 30)
    ) %>%
    group_by(company_id, week) %>%
    summarize(
      startDate = min(startDate),
      endDate = max(endDate)
    )
}

# Sort the periods chronologically within each company and merge them.
# chunk() is applied twice: the second pass merges blocks again, using the
# start/end dates that the first pass produced.
blocks %>%
  group_by(company_id) %>%
  arrange(company_id, startDate) %>%
  chunk() %>%
  chunk()

结果:

# A tibble: 5 x 4
# Groups:   company_id [?]
  company_id  week  startDate    endDate
       <int> <int>     <date>     <date>
1     209952     1 2012-09-17 2017-10-23
2    2802315     1 2012-10-19 2014-05-18
3    2802315     2 2014-09-29 2014-11-29
4    2802315     3 2015-04-22 2015-09-23
5    2802315     4 2015-11-23 2016-05-23

数据:

# Example data: 64 rows of periods for two companies (209952 and 2802315).
# company_id is an integer column; startDate and endDate are numeric vectors
# carrying class "Date", i.e. day counts since 1970-01-01.
# NOTE(review): this looks like dput() output of the asker's data -- confirm.
blocks = structure(list(company_id = c(209952L, 209952L, 209952L, 209952L, 
209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 
209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 
209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 
209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 
209952L, 209952L, 209952L, 209952L, 209952L, 209952L, 2802315L, 
2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 
2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 
2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 2802315L, 
2802315L, 2802315L, 2802315L, 2802315L), startDate = structure(c(15600, 
15630, 15661, 15691, 15722, 15753, 15781, 15812, 15842, 15873, 
15903, 15934, 15965, 15995, 16026, 16056, 16087, 16113, 16141, 
16172, 16202, 16233, 16263, 16294, 16325, 16355, 16386, 16416, 
16447, 16478, 16506, 16538, 16562, 16562, 16593, 16623, 16654, 
16928, 15632, 15663, 15693, 15724, 15755, 15783, 15814, 15844, 
15875, 15905, 15936, 15967, 15997, 16027, 16057, 16088, 16119, 
16147, 16178, 16342, 16372, 16547, 16576, 16609, 16639, 16762
), class = "Date"), endDate = structure(c(15630, 15661, 15691, 
15722, 15753, 15781, 15812, 15842, 15873, 15903, 15934, 15965, 
15995, 16026, 16056, 16087, 16118, 16141, 16172, 16202, 16233, 
16263, 16294, 16325, 16355, 16386, 16416, 16447, 16478, 16506, 
16537, 16568, 16928, 16593, 16623, 16654, 16685, 17462, 15663, 
15693, 15724, 15755, 15783, 15814, 15844, 15875, 15905, 15936, 
15967, 15997, 16028, 16057, 16088, 16119, 16147, 16178, 16208, 
16372, 16403, 16577, 16607, 16701, 16670, 16944), class = "Date")), class = "data.frame", .Names = c("company_id", 
# Negative row.names value is the compact form for automatic row names 1..64.
"startDate", "endDate"), row.names = c(NA, -64L))

library(lubridate)
# Parse every character column as a year-month-day date with lubridate::ymd.
# The structure() literal above already stores both date columns with class
# "Date", so no column matches is.character for that data.
blocks <- mutate_if(blocks, is.character, ymd)