Question

我正在尝试计算行数，直到在分组数据框中达到条件。我试图调整解决方案here，但这似乎不适用于群组。

示例数据：

grp <- c(rep(1:2, each = 5), 3)
fromdate <- as.Date(c("2010-06-01", "2012-02-01", "2013-02-01", "2013-02-01", "2015-10-01", "2011-02-01", "2011-03-01", "2013-04-01", "2013-06-01", "2013-10-01", "2012-02-01"), origin = "1970-01-01")
todate <- as.Date(c("2016-12-31", "2013-01-31", "2015-10-31", "2015-12-31", "2016-01-31", "2013-02-28", "2013-02-28", "2013-09-30", "2016-12-31", "2017-01-31", "2014-01-31"), origin = "1970-01-01")
df <- data.frame(grp, fromdate, todate)

我的最终目标是每组连续覆盖期限为一行。为此，我需要执行以下操作： 1）识别日期完全在前一行日期内的行（即，fromdate较大且todate较小）。然后我会删除这些日期飞行物。 2）识别当前行的fromdate何时小于前一行的todate，即覆盖重叠。然后，我会将第一行的todate重写为连续覆盖期间的最新修改，并删除其他行。

我有代码要做2）但我正在努力解决第1部分。

到目前为止，我的方法是按日期排序并向下搜索，直到达到更大的数据。这将是所需的输出：

grp   fromdate     todate      drop
 1    2010-06-01   2016-12-31  0
 1    2012-02-01   2013-01-31  1
 1    2013-02-01   2015-10-31  1
 1    2013-02-01   2015-12-31  1
 1    2015-10-01   2016-01-31  1
 2    2011-02-01   2013-02-28  0
 2    2011-03-01   2013-02-28  1
 2    2013-04-01   2013-09-30  0
 2    2013-06-01   2016-12-31  0
 2    2013-10-01   2017-01-31  0
 3    2012-02-01   2014-01-31  0

在应用第2部分之后，最终的df应该是这样的：

grp   fromdate     todate    
 1    2010-06-01   2016-12-31
 2    2011-02-01   2013-02-28
 2    2013-04-01   2017-01-31
 3    2012-02-01   2014-01-31

这可以计算直到更大日期的行数，但仅限于未分组数据：

df <- df %>%
arrange(grp, fromdate, todate) %>%
mutate(rows_to_max = sapply(1:length(todate), 
      function(x) min(which(.$todate[x:length(.$todate)] > .$todate[x]))-1)) %>%
ungroup()

我希望保持解决方案与dplyr兼容，但我愿意接受其他选择。

提前致谢。

Answer 1

假设您要删除任何前一个时间间隔中包含的时间间隔，lubridate是您的朋友：

library(lubridate)
df$int <- interval(df$fromdate, df$todate)
drop <- sapply(2:nrow(df),  function(x) {
    any(df$int[x] %within% df$int[1:(x-1)])
})
df$drop <- c(FALSE, drop)

这还没有解决你需要按组进行的事情。以下应该可以工作，但不会：

df %>% 
    group_by(grp) %>% 
    mutate(
      drop = c(FALSE, sapply(2:n(), function(x) any(int[x] %within% int[1:(x-1)])))
    )

为什么不呢？我不确定但是有一些非常可怕的错误：

 tmp <- df %>% filter(grp==2)
 tmp

#    grp   fromdate     todate                            int
#  1   2 2011-02-01 2013-02-28 2010-06-01 UTC--2012-06-28 UTC
#  2   2 2011-03-01 2013-02-28 2012-02-01 UTC--2014-01-31 UTC   <<-  WTF???
#  3   2 2013-04-01 2013-09-30 2013-02-01 UTC--2013-08-02 UTC
#  4   2 2013-06-01 2016-12-31 2013-02-01 UTC--2016-09-02 UTC
#  5   2 2013-10-01 2017-01-31 2015-10-01 UTC--2019-01-31 UTC

因此，我们将避免混合间隔和分组数据帧。惩罚是一些丑陋的多方括号：

ivls <- interval(df$fromdate, df$todate)

df$idx <- 1:nrow(df)

df %>% 
  group_by(grp) %>% 
  mutate(
    drop = c(FALSE, sapply(2:n(), function(x) any(ivls[ idx[x] ] %within% ivls[ idx[1]:idx[x-1] ])))
  )

df

# Source: local data frame [10 x 5]
# Groups: grp [2]
# 
#      grp   fromdate     todate   idx  drop
#    <int>     <date>     <date> <int> <lgl>
# 1      1 2010-06-01 2016-12-31     1 FALSE
# 2      1 2012-02-01 2013-01-31     2  TRUE
# 3      1 2013-02-01 2015-10-31     3  TRUE
# 4      1 2013-02-01 2015-12-31     4  TRUE
# 5      1 2015-10-01 2016-01-31     5  TRUE
# 6      2 2011-02-01 2013-02-28     6 FALSE
# 7      2 2011-03-01 2013-02-28     7  TRUE
# 8      2 2013-04-01 2013-09-30     8 FALSE
# 9      2 2013-06-01 2016-12-31     9 FALSE
# 10     2 2013-10-01 2017-01-31    10 FALSE

Answer 2

使用data.table::foverlap匹配行，然后迭代折叠它们。

grp <- rep(1:2, each = 5)
fromdate <- as.Date(c("2010-06-01", "2012-02-01", "2013-02-01", "2013-02-01", "2015-10-01", "2011-02-01", "2011-03-01", "2013-04-01", "2013-06-01", "2013-10-01"), origin = "1970-01-01")
todate <- as.Date(c("2016-12-31", "2013-01-31", "2015-10-31", "2015-12-31", "2016-01-31", "2013-02-28", "2013-02-28", "2013-09-30", "2016-12-31", "2017-01-31"), origin = "1970-01-01")
df <- data.frame(grp, fromdate, todate)

require(data.table)
setDT(df)
checklength <- 0

while (checklength != dim(df)[1]){

  # set our row count
  checklength <- dim(df)[1]

  # use data.table's foverlaps to match up rows
  setkey(df, grp, fromdate, todate)
  df <- foverlaps(df, df, mult = 'first')

  # collapse rows that have matched
  df[, todate   := pmax(todate, i.todate)]
  df[, fromdate := pmin(fromdate, i.fromdate)]
  df[, todate   := max(todate), .(grp, fromdate)]
  df[, fromdate := min(fromdate), .(grp, todate)]
  df <- unique(df[, .(grp, fromdate, todate)])
}

我无法想出一种摆脱这种迭代本质的方法。

Answer 3

这是我尝试解决此问题的另一种方式：

repeat {
  dfsize <-  nrow(df)
  df <- df%>%
    group_by(grp) %>%
    mutate(drop = ifelse((fromdate > lag(fromdate, 1) &
                            todate <= lag(todate, 1)) &
                           !is.na(lag(fromdate, 1)) &
                           !is.na(lag(todate, 1)),
                         1,
                         0
    )) %>%
    ungroup() %>%
    filter(drop == 0)
  dfsize2 <- nrow(df)
  if (dfsize2 == dfsize) {
    break
  }
}

它可以有效地处理我的数据子集（至少最多约100,000行和38,000个组）。然而，当我尝试在1.5米行和655,000组上运行它时，它看起来永远突然（直到我中止）。我最终手动重复mutate语句大约20次。

这只是数据问题的一个大小，还是有更有效的方法来解决问题？

R计数行，直到组达到条件

3 个答案: