我想在2017年获得几个日期间隔的缺失部分。
例如,以下数据帧的每个“ id”:
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
id start end
a 2017-01-01 2017-01-15
a 2017-01-10 2017-01-20
a 2017-02-10 2017-02-20
b 2017-03-01 2017-03-28
b 2017-04-20 2017-04-29
我想得到:
df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))
id start end
a 2017-01-21 2017-02-09
a 2017-02-21 2017-12-31
b 2017-01-01 2017-02-28
b 2017-03-29 2017-04-19
b 2017-04-30 2017-12-31
谢谢!
答案 0 :(得分:2)
首先,确认start
和end
是否为日期课。
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
使用by()
根据ID将数据分成两个数据帧的列表。
library(purrr)
by(df, df$id, function(x){
year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
ind <- map2(x$start, x$end, function(start, end){
which(year < start | year > end)
}) %>% reduce(intersect)
gap <- which(diff(ind) > 1)
head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)
说明:
start
和end
之间的日期,结果表示缺少日期的索引。输出:
# id start end
# 1 a 2017-01-21 2017-02-09
# 2 a 2017-02-21 2017-12-31
# 3 b 2017-01-01 2017-02-28
# 4 b 2017-03-29 2017-04-19
# 5 b 2017-04-30 2017-12-31
我认为我的解决方案仍然很麻烦。希望能帮助您。
答案 1 :(得分:0)
我最近遇到了类似的问题,我发现将表扩展为在每个相关日期获得一行,然后折叠回范围,比尝试仅从范围端点得出正确的逻辑要容易得多。
这是该方法的工作方式。另外,也可以执行类似this或this的操作,但是这些方法不会出现您正在处理的“不在范围内”的问题。
library(dplyr)
library(fuzzyjoin)
library(lubridate)
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
# Convert the strings to real dates.
mutate(start.date = as.Date(start),
end.date = as.Date(end)) %>%
# Left join to 2017 dates on dates that are in the range of this record.
fuzzy_left_join(all.2017.dates,
by = c("start.date" = "date", "end.date" = "date"),
match_fun = list(`<=`, `>=`)) %>%
# Filter to distinct ids/dates.
select(id, date) %>%
distinct()
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
date = all.2017.dates$date) %>%
# Anti-join on id and date.
anti_join(df.expanded,
by = c("id", "date")) %>%
# Sort by id, then date, so that the lead/lag functions behave as expected.
arrange(id, date) %>%
# Check whether this record is an endpoint (i.e., is it adjacent to the
# previous/next record?).
mutate(prev.day.included = coalesce(date == lag(date) + 1 &
id == lag(id), F),
next.day.included = coalesce(date == lead(date) - 1 &
id == lag(id), F)) %>%
# Filter to just endpoint records.
filter(!prev.day.included | !next.day.included) %>%
# Fill in both start and end dates on "start" records. The start date is the
# date in the record; the end date is the date of the next record.
mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
origin = lubridate::origin),
end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
origin = lubridate::origin)) %>%
filter(!is.na(start.date))
答案 2 :(得分:0)
这是我的解决方法:
library(tidyverse)
library(lubridate)
library(wrapr)
df %>%
mutate_at(2:3, ymd) %>%
group_by(id) %>%
gather('start_end', 'date', start:end) %>%
mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
unique() %>%
mutate(
start = if_else(
start_end == 'start',
date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
date
),
end = if_else(
start_end == 'end',
date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
date
)) %>%
filter(start != end) %>%
select(id, start, end) %>%
mutate(supp = TRUE) %>%
bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
arrange(id, start) %>%
mutate(rn = row_number()) %.>%
left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
na.omit() %>%
mutate(
start = case_when(
(start.y >= end.x) & !supp.x ~ end.x + 1,
(start.y >= end.x) & supp.x ~ start.x,
TRUE ~ as.Date(NA)
),
end = case_when(
(start.y >= end.x) & supp.y ~ end.y,
(start.y >= end.x) & !supp.y ~ start.y - 1,
TRUE ~ as.Date(NA)
)
) %>%
select(id, start, end) %>%
na.omit()