我一直试图找到一个优雅的data.table解决方案(但如果有更好的东西,我将采取另一种解决方案)如何组合处理(1)嵌套间隔,(2)具有不同优先级的间隔的日期间隔, (3)带间隙的间隔(参见示例输入对象)。我设法通过foverlaps和shift找到了一个近乎成功的解决方案。我希望避免使用/ while循环或笛卡尔产品,这些产品既低效又不优雅,但肯定会起作用。我希望有更好的东西,因为这是我必须处理的常见数据问题。根据要求,如果有用的话,我会展示我几乎有效的解决方案。
require(data.table)
# my data looks somewhat like this...
input <- data.table(
person_ID = c(rep(98723, 4), rep(8534, 2), 11223, rep(22446, 2)),
team = c(rep("A", 4), rep("B", 2), "A", "B", "A"),
start_date = as.Date(c("2009-10-1", "2011-11-21", "2012-1-23", "2013-3-2",
"2009-11-14", "2010-1-1", "2012-1-2", "2011-2-2", "2012-4-3")),
end_date = as.Date(c("2010-5-23", NA, "2015-01-02", "2013-3-2", "2009-12-31",
"2010-3-1", "2015-03-22", "2016-1-2", "2014-9-30")))
team_priority <- data.table(team = c("A", "B"), priority = c(1, 2))
input[team_priority, priority := i.priority, on = "team"]
输入
person_ID team start_date end_date priority
1: 98723 A 2009-10-01 2010-05-23 1
2: 98723 A 2011-11-21 <NA> 1
3: 98723 A 2012-01-23 2015-01-02 1
4: 98723 A 2013-03-02 2013-03-02 1
5: 8534 B 2009-11-14 2009-12-31 2
6: 8534 B 2010-01-01 2010-03-01 2
7: 11223 A 2012-01-02 2015-03-22 1
8: 22446 B 2011-02-02 2016-01-02 2
9: 22446 A 2012-04-03 2014-09-30 1
# problem 1: gap in teams prevents simple min/max solution (see person_ID == 98723)
# problem 2: teams have priorities, so if team B is inside of time interval assigned to time A,
# then we need the records to reflect the following:
# team B -> team A -> team B based on when teams A & B start/stop (see person_ID == 22446)
# NOTE: problem 1 and 2 can be combined (I am trying to fix bad data entry)
# I have to assign priorities based on team involvement (A > B > C, etc)
output <- data.table(
person_ID = c(rep(98723, 4), rep(8534, 2), 11223, rep(22446, 3)),
team = c(rep("A", 4), rep("B", 2), "A", "B", "A", "B"),
start_date = as.Date(c("2009-10-1", "2011-11-21", "2012-1-23", "2013-3-2",
"2009-11-14", "2010-1-1", "2012-1-2", "2011-2-2", "2012-4-3", "2014-10-1")),
end_date = as.Date(c("2010-5-23", NA, "2015-01-02", "2013-3-2", "2009-12-31",
"2010-3-1", "2015-03-22", "2012-4-2", "2014-9-30", "2016-1-2")),
group_id = c(1, rep(2, 3), rep(4, 2), 5, 6, 7, 8))
输出
person_ID team start_date end_date group_id
1: 98723 A 2009-10-01 2010-05-23 1
2: 98723 A 2011-11-21 <NA> 2
3: 98723 A 2012-01-23 2015-01-02 2
4: 98723 A 2013-03-02 2013-03-02 2
5: 8534 B 2009-11-14 2009-12-31 4
6: 8534 B 2010-01-01 2010-03-01 4
7: 11223 A 2012-01-02 2015-03-22 5
8: 22446 B 2011-02-02 2012-04-02 6
9: 22446 A 2012-04-03 2014-09-30 7
10: 22446 B 2014-10-01 2016-01-02 8
简单的MIN / MAX解决方案无效!我不反对单独处理嵌套间隔(参见person_id == 22446),但这确实是我无法弄清楚的部分,同时仍然可以同时满足所有其他目标。
答案 0 :(得分:0)
可能还有其他方法可以更有效地完成此操作。但我认为这比没有好。如果有人带来更好的东西,真棒,我会乐意选择他们的回应。也许我应该通过将其分成几部分来使这更加可口,但如果我知道哪些部分,我首先不必问。 :)无论如何,这是我的解决方案,我会更加精致,因为它会进入我的R包中,但这至少可以回答我到目前为止所测试的所有方法中的问题。
我不得不打破我的意愿,因为这部分包含了循环,其中一部分使用了笛卡尔积。我想不出一种避免它们的方法。(
)overlap_combine <-
function(overlap_dt, id_cols, team_col, start_col, end_col, overlap_int = 1L,
replace_blanks = Sys.Date(), priority_col = "priority") {
setorderv(overlap_dt, c(id_cols, team_col, start_col))
overlap_dt[, end_col := get(end_col) + overlap_int]
sd_cols <- c(start_col, "end_col")
# foverlaps cannot deal with blanks
overlap_dt[is.na(end_col), end_col := replace_blanks]
# note: if end_col becomes < start_col due to overlap_int,
# we assign end_col <- start_col
overlap_dt[end_col - get(start_col) < 0, end_col := start_col]
overlap_dt[, index := .I]
setnames(overlap_dt, start_col, "start_date")
# setnames(overlap_dt, team_col, "team")
# finding overlapping combinations via vectors of indices ---
c_overlap <-
overlap_dt[overlap_dt[, unique(.SD), .SDcols =
c(id_cols, team_col, "start_date", "end_col", "index")],
on = c(id_cols, team_col), allow.cartesian = TRUE]
c_overlap <- c_overlap[i.index != index]
c_overlap[between(i.start_date, start_date, end_col) |
between(i.end_col, start_date, end_col),
ovr_vec := list(list(unique(c(index, i.index)))),
by = c(id_cols, team_col, "start_date")]
ovr_l <- c_overlap[, ovr_vec]
ovr_l <- Filter(Negate(function(x) is.null(unlist(x))), ovr_l)
ovr_l <- unique(ovr_l)
# find list of reduced vectors which we need to MIN/MAX ---
ovr_red_l <- list()
for (i in seq_along(ovr_l)) {
tmp_inter <- unique(unlist(sapply(
ovr_l,
FUN = function(x) {
if (length(intersect(unlist(x), unlist(ovr_l[i]))) > 0) {
result <- union(unlist(x), unlist(ovr_l[i]))
return(result)
} else {
return(ovr_l[i])
}
}
)))
ovr_red_l[[i]] <- sort(tmp_inter)
}
ovr_red_l <- unique(ovr_red_l)
for (i in seq(ovr_red_l)) {
setkey(overlap_dt, index)[ovr_red_l[[i]],
c("start_date", "end_date", "end_col") :=
list(min(start_date), max(end_date), max(end_col))]
}
overlap_dt[, index := NULL]
overlap_dt <- unique(overlap_dt)
setkeyv(overlap_dt, c(id_cols, "start_date"))
# figure out which intervals are nested, separate them out & deal with them
overlap_dt[, index := .I]
overlap_dt[, prior_shift := priority - shift(priority, n = 1), by = id_cols]
overlap_dt[between(shift(start_date, n = 1), start_date, end_col), ovr_shift := 1, by = id_cols]
overlap_dt[shift(ovr_shift) == 1 & prior_shift < 0, nested_ovr := 1]
overlap_dt[index %in% overlap_dt[nested_ovr == 1, index - 1], nested_ovr := 1]
overlap_nested_dt <- overlap_dt[nested_ovr == 1]
overlap_dt <- overlap_dt[is.na(nested_ovr)]
nested_melt_dt <-
melt(overlap_nested_dt, id.vars = c(id_cols, team_col, priority_col),
measure.vars = c("start_date", "end_col"), value.name = "date_value")
setkey(nested_melt_dt, person_ID, date_value)
nested_melt_dt[, boundaries := cumsum(ifelse(variable == "start_date", 1, -1))]
nested_melt_dt[boundaries==1 & variable == "start_date", bound_ok := 1]
nested_melt_dt[boundaries==0 & variable == "end_col", bound_ok := 1]
# create new records that need to be added ---
add_dt <- nested_melt_dt[0, ]
if (nrow(nested_melt_dt)>0) {
for (i in 1:nrow(nested_melt_dt))
if (nested_melt_dt[i, is.na(bound_ok)]) {
if (nested_melt_dt[i, variable] == "start_date") {
add_dt <- rbindlist(list(add_dt,
data.table(nested_melt_dt[i, person_ID],
nested_melt_dt[i-1, team],
nested_melt_dt[i-1, priority],
"end_col",
nested_melt_dt[i, date_value - 1],
NA, NA)))
} else if (nested_melt_dt[i, variable] == "end_col") {
add_dt <- rbindlist(list(add_dt,
data.table(nested_melt_dt[i, person_ID],
nested_melt_dt[i+1, team],
nested_melt_dt[i+1, priority],
"start_date",
nested_melt_dt[i, date_value + 1],
NA, NA)))
}
}
}
# add new records back in
nested_melt_dt <- rbindlist(list(nested_melt_dt, add_dt))
setkeyv(nested_melt_dt, c(id_cols, "date_value"))
nested_melt_dt[, c("bound_ok", "boundaries") := NULL]
nested_melt_dt[, date_group := .I, by = id_cols]
nested_melt_dt[variable == "end_col", date_group := date_group - 1L]
nested_melt_dt <-
dcast(nested_melt_dt, paste0(paste(id_cols, sep = "+"), "+", team_col, "+",
priority_col, "+", "date_group ~ variable"), value.var = "date_value")
nested_melt_dt[, end_date := end_col]
nested_melt_dt[, c("date_group") := NULL]
overlap_dt[, c("index", "prior_shift", "ovr_shift", "nested_ovr") := NULL]
overlap_dt <- rbindlist(list(overlap_dt, nested_melt_dt), use.names = TRUE)
setorderv(overlap_dt, c(id_cols, start_col, team_col))
return(overlap_dt)
}
overlap_combine(overlap_dt = copy(input), id_cols = "person_ID",
team_col = "team", start_col = "start_date", end_col = "end_date",
overlap_int = 1L, replace_blanks = Sys.Date() + 1e3,
priority_col = "priority")
功能输出
person_ID team start_date end_date priority end_col
1: 8534 B 2009-11-14 2010-03-01 2 2010-03-02
2: 11223 A 2012-01-02 2015-03-22 1 2015-03-23
3: 22446 B 2011-02-02 2012-04-02 2 2012-04-02
4: 22446 A 2012-04-03 2014-10-01 1 2014-10-01
5: 22446 B 2014-10-02 2016-01-03 2 2016-01-03
6: 98723 A 2009-10-01 2010-05-23 1 2010-05-24
7: 98723 A 2011-11-21 <NA> 1 2018-12-17