重叠的日期间隔与差距

时间:2016-03-17 23:10:12

标签: r data.table

我一直试图找到一个优雅的data.table解决方案(但如果有更好的东西,我将采取另一种解决方案)如何组合处理(1)嵌套间隔,(2)具有不同优先级的间隔的日期间隔, (3)带间隙的间隔(参见示例输入对象)。我设法通过foverlaps和shift找到了一个近乎成功的解决方案。我希望避免使用/ while循环或笛卡尔产品,这些产品既低效又不优雅,但肯定会起作用。我希望有更好的东西,因为这是我必须处理的常见数据问题。根据要求,如果有用的话,我会展示我几乎有效的解决方案。

require(data.table)
# my data looks somewhat like this...
input <- data.table(
  person_ID = c(rep(98723, 4), rep(8534, 2), 11223, rep(22446, 2)),
  team = c(rep("A", 4), rep("B", 2), "A", "B", "A"),
  start_date = as.Date(c("2009-10-1", "2011-11-21", "2012-1-23", "2013-3-2", 
    "2009-11-14", "2010-1-1", "2012-1-2", "2011-2-2", "2012-4-3")),
  end_date = as.Date(c("2010-5-23", NA, "2015-01-02", "2013-3-2", "2009-12-31", 
    "2010-3-1", "2015-03-22", "2016-1-2", "2014-9-30")))
team_priority <- data.table(team = c("A", "B"), priority = c(1, 2))
input[team_priority, priority := i.priority, on = "team"]

输入

  person_ID team start_date   end_date priority
1:     98723    A 2009-10-01 2010-05-23        1
2:     98723    A 2011-11-21       <NA>        1
3:     98723    A 2012-01-23 2015-01-02        1
4:     98723    A 2013-03-02 2013-03-02        1
5:      8534    B 2009-11-14 2009-12-31        2
6:      8534    B 2010-01-01 2010-03-01        2
7:     11223    A 2012-01-02 2015-03-22        1
8:     22446    B 2011-02-02 2016-01-02        2
9:     22446    A 2012-04-03 2014-09-30        1
# problem 1: gap in teams prevents simple min/max solution (see person_ID == 98723)

# problem 2: teams have priorities, so if team B is inside of time interval assigned to time A,
# then we need the records to reflect the following:
# team B -> team A -> team B based on when teams A & B start/stop (see person_ID == 22446)

# NOTE: problem 1 and 2 can be combined (I am trying to fix bad data entry)

# I have to assign priorities based on team involvement (A > B > C, etc)
output <- data.table(
  person_ID = c(rep(98723, 4), rep(8534, 2), 11223, rep(22446, 3)),
  team = c(rep("A", 4), rep("B", 2), "A", "B", "A", "B"),
  start_date = as.Date(c("2009-10-1", "2011-11-21", "2012-1-23", "2013-3-2", 
    "2009-11-14", "2010-1-1", "2012-1-2", "2011-2-2",  "2012-4-3", "2014-10-1")),
  end_date = as.Date(c("2010-5-23", NA, "2015-01-02", "2013-3-2", "2009-12-31", 
    "2010-3-1", "2015-03-22", "2012-4-2", "2014-9-30", "2016-1-2")),
  group_id = c(1, rep(2, 3), rep(4, 2), 5, 6, 7, 8))

输出

    person_ID team start_date   end_date group_id
 1:     98723    A 2009-10-01 2010-05-23        1
 2:     98723    A 2011-11-21       <NA>        2
 3:     98723    A 2012-01-23 2015-01-02        2
 4:     98723    A 2013-03-02 2013-03-02        2
 5:      8534    B 2009-11-14 2009-12-31        4
 6:      8534    B 2010-01-01 2010-03-01        4
 7:     11223    A 2012-01-02 2015-03-22        5
 8:     22446    B 2011-02-02 2012-04-02        6
 9:     22446    A 2012-04-03 2014-09-30        7
10:     22446    B 2014-10-01 2016-01-02        8

简单的MIN / MAX解决方案无效!我不反对单独处理嵌套间隔(参见person_id == 22446),但这确实是我无法弄清楚的部分,同时仍然可以同时满足所有其他目标。

1 个答案:

答案 0 :(得分:0)

可能还有其他方法可以更有效地完成此操作。但我认为这比没有好。如果有人带来更好的东西,真棒,我会乐意选择他们的回应。也许我应该通过将其分成几部分来使这更加可口,但如果我知道哪些部分,我首先不必问。 :)无论如何,这是我的解决方案,我会更加精致,因为它会进入我的R包中,但这至少可以回答我到目前为止所测试的所有方法中的问题。

我不得不打破我的意愿,因为这部分包含了循环,其中一部分使用了笛卡尔积。我想不出一种避免它们的方法。(

overlap_combine <- 
  function(overlap_dt, id_cols, team_col, start_col, end_col, overlap_int = 1L, 
           replace_blanks = Sys.Date(), priority_col = "priority") {
    setorderv(overlap_dt, c(id_cols, team_col, start_col))
    overlap_dt[, end_col := get(end_col) + overlap_int]
    sd_cols <- c(start_col, "end_col")
    # foverlaps cannot deal with blanks
    overlap_dt[is.na(end_col), end_col := replace_blanks]
    # note: if end_col becomes < start_col due to overlap_int,
    # we assign end_col <- start_col
    overlap_dt[end_col - get(start_col) < 0, end_col := start_col]
    overlap_dt[, index := .I]
    setnames(overlap_dt, start_col, "start_date")
    # setnames(overlap_dt, team_col, "team")
    # finding overlapping combinations via vectors of indices ---
    c_overlap <- 
      overlap_dt[overlap_dt[, unique(.SD), .SDcols = 
                              c(id_cols, team_col, "start_date", "end_col", "index")], 
                 on = c(id_cols, team_col), allow.cartesian = TRUE]
    c_overlap <- c_overlap[i.index != index]
    c_overlap[between(i.start_date, start_date, end_col) |
                between(i.end_col, start_date, end_col), 
              ovr_vec := list(list(unique(c(index, i.index)))), 
              by = c(id_cols, team_col, "start_date")]
    ovr_l <- c_overlap[, ovr_vec]
    ovr_l <- Filter(Negate(function(x) is.null(unlist(x))), ovr_l)
    ovr_l <- unique(ovr_l)
    # find list of reduced vectors which we need to MIN/MAX ---
    ovr_red_l <- list()
    for (i in seq_along(ovr_l)) {
      tmp_inter <- unique(unlist(sapply(
        ovr_l,
        FUN = function(x) {
          if (length(intersect(unlist(x), unlist(ovr_l[i]))) > 0) {
            result <- union(unlist(x), unlist(ovr_l[i]))
            return(result)
          } else {
            return(ovr_l[i])
          }
        }
      )))
      ovr_red_l[[i]] <- sort(tmp_inter)
    }
    ovr_red_l <- unique(ovr_red_l)

    for (i in seq(ovr_red_l)) {
      setkey(overlap_dt, index)[ovr_red_l[[i]],
                                c("start_date", "end_date", "end_col") := 
                                  list(min(start_date), max(end_date), max(end_col))]
    }
    overlap_dt[, index := NULL]
    overlap_dt <- unique(overlap_dt)
    setkeyv(overlap_dt, c(id_cols, "start_date"))
    # figure out which intervals are nested, separate them out & deal with them
    overlap_dt[, index := .I]
    overlap_dt[, prior_shift := priority - shift(priority, n = 1), by = id_cols]
    overlap_dt[between(shift(start_date, n = 1), start_date, end_col), ovr_shift := 1, by = id_cols]
    overlap_dt[shift(ovr_shift) == 1 & prior_shift < 0, nested_ovr := 1]
    overlap_dt[index %in% overlap_dt[nested_ovr == 1, index - 1], nested_ovr := 1]
        overlap_nested_dt <- overlap_dt[nested_ovr == 1]
    overlap_dt <- overlap_dt[is.na(nested_ovr)]
    nested_melt_dt <- 
      melt(overlap_nested_dt, id.vars = c(id_cols, team_col, priority_col), 
           measure.vars = c("start_date", "end_col"), value.name = "date_value")
    setkey(nested_melt_dt, person_ID, date_value)
    nested_melt_dt[, boundaries := cumsum(ifelse(variable == "start_date", 1, -1))]
    nested_melt_dt[boundaries==1 & variable == "start_date", bound_ok := 1]
    nested_melt_dt[boundaries==0 & variable == "end_col", bound_ok := 1]
    # create new records that need to be added ---
    add_dt <- nested_melt_dt[0, ]
    if (nrow(nested_melt_dt)>0) {
      for (i in 1:nrow(nested_melt_dt))
        if (nested_melt_dt[i, is.na(bound_ok)]) {
          if (nested_melt_dt[i, variable] == "start_date") {
            add_dt <- rbindlist(list(add_dt, 
                                     data.table(nested_melt_dt[i, person_ID],
                                                nested_melt_dt[i-1, team],
                                                nested_melt_dt[i-1, priority],
                                                "end_col", 
                                                nested_melt_dt[i, date_value - 1],
                                                NA, NA)))
          } else if (nested_melt_dt[i, variable] == "end_col") {
            add_dt <- rbindlist(list(add_dt, 
                                     data.table(nested_melt_dt[i, person_ID],
                                                nested_melt_dt[i+1, team],
                                                nested_melt_dt[i+1, priority],
                                                "start_date", 
                                                nested_melt_dt[i, date_value + 1],
                                                NA, NA)))
          }
        }
    }
    # add new records back in
    nested_melt_dt <- rbindlist(list(nested_melt_dt, add_dt))
    setkeyv(nested_melt_dt, c(id_cols, "date_value"))
    nested_melt_dt[, c("bound_ok", "boundaries") := NULL]
    nested_melt_dt[, date_group := .I, by = id_cols]
    nested_melt_dt[variable == "end_col", date_group := date_group - 1L]
    nested_melt_dt <-
      dcast(nested_melt_dt, paste0(paste(id_cols, sep = "+"), "+", team_col, "+",  
                                   priority_col, "+", "date_group ~ variable"), value.var = "date_value")
    nested_melt_dt[, end_date := end_col]
    nested_melt_dt[, c("date_group") := NULL]
    overlap_dt[,  c("index", "prior_shift", "ovr_shift", "nested_ovr") := NULL]
    overlap_dt <- rbindlist(list(overlap_dt, nested_melt_dt), use.names = TRUE)
    setorderv(overlap_dt, c(id_cols, start_col, team_col))
    return(overlap_dt)
  }
overlap_combine(overlap_dt = copy(input), id_cols = "person_ID", 
                team_col = "team", start_col = "start_date", end_col = "end_date", 
                overlap_int = 1L, replace_blanks = Sys.Date() + 1e3, 
                priority_col = "priority")

功能输出

   person_ID team start_date   end_date priority    end_col
1:      8534    B 2009-11-14 2010-03-01        2 2010-03-02
2:     11223    A 2012-01-02 2015-03-22        1 2015-03-23
3:     22446    B 2011-02-02 2012-04-02        2 2012-04-02
4:     22446    A 2012-04-03 2014-10-01        1 2014-10-01
5:     22446    B 2014-10-02 2016-01-03        2 2016-01-03
6:     98723    A 2009-10-01 2010-05-23        1 2010-05-24
7:     98723    A 2011-11-21       <NA>        1 2018-12-17