我有一个数据框,其中包含带有患者ID和日期的入院事件。
问题
我想合并HospNum_Id与上一行相同、且两行之间的日期差不超过3天(含3天)的任何行。
输入
合成的示例数据集如下所示:
# dput() representation of the sample data frame: 12 rows, with HospNum_Id a
# factor (3 patient IDs), VisitDate a Date, and diffDate a difftime in days.
# Each diffDate equals the row's VisitDate minus the next VisitDate of the same
# HospNum_Id, and is NA on the last row of each HospNum_Id.
structure(list(HospNum_Id = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A791697", "V682805", "X608693"
), class = "factor"), VisitDate = structure(c(17181, 17183, 17192,
17168, 17169, 17186, 17189, 17212, 17215, 17167, 17173, 17190
), class = "Date"), diffDate = structure(c(-2, -9, NA, -1, -17,
-3, -23, -3, NA, -6, -17, NA), class = "difftime", units = "days")), .Names = c("HospNum_Id",
"VisitDate", "diffDate"), row.names = c(NA, -12L), class = "data.frame")
我的尝试
我已采取的步骤是
# Order the rows by patient ID, then by visit date within each patient.
Mydf <- Mydf[with(Mydf, order(HospNum_Id, VisitDate)), ]
library(rlang)
library(dplyr)
#' Compute the gap between consecutive visits for each ID
#'
#' Sorts `Mydf` by the ID column and then by the date column, and adds a
#' `diffDate` column holding each visit date minus the next visit date of the
#' same ID. Because of the sort order the values are zero or negative; the
#' last visit of each ID gets `NA`.
#'
#' @param Mydf Data frame of visits.
#' @param HospNum_Id Name of the ID column, given as a string.
#' @param VisitDate Name of the date column, given as a string; its values are
#'   passed through `as.Date()`.
#' @return A plain, ungrouped `data.frame` sorted as described, with
#'   `diffDate` as a `difftime` measured in days.
SurveilTimeByRow <- function(Mydf, HospNum_Id, VisitDate) {
  # Turn the column-name strings into symbols so they can be unquoted with !!.
  HospNum_Ida <- sym(HospNum_Id)
  VisitDatea <- sym(VisitDate)

  ret <- Mydf %>%
    arrange(!!HospNum_Ida, !!VisitDatea) %>%
    group_by(!!HospNum_Ida) %>%
    mutate(
      # lead() is evaluated per group, so the last row of each ID has no
      # following visit and yields NA.
      diffDate = difftime(
        as.Date(!!VisitDatea),
        lead(as.Date(!!VisitDatea), 1),
        units = "days"
      )
    )

  # Converting with data.frame() drops the grouped-tibble classes.
  data.frame(ret)
}
# Compute diffDate for the data frame named `try`, using its HospNum_Id and
# VisitDate columns.
Mydf <- SurveilTimeByRow(
  try,
  HospNum_Id = "HospNum_Id",
  VisitDate = "VisitDate"
)
这就是我卡住的地方。
所需的输出
HospNum_Id VisitDate diffDate HospNum_Id.1 VisitDate.1 diffDate.1
A791697 2017-01-15 -2 days A791697 2017-01-17 -9 days
V682805 2017-01-02 -1 days V682805 2017-01-03 -17 days
V682805 2017-01-20 -3 days V682805 2017-01-23 -23 days
V682805 2017-02-15 -3 days V682805 2017-02-18 NA days
我会去掉最后一列diffDate.1,因为该列最终是多余的。
答案 0 :(得分:2)
以下是一种可能的解决方案,其中将您发布的数据命名为df:
library(tidyverse)
# Number consecutive pairs of rows within each HospNum_Id: rows 1-2 get id 1,
# rows 3-4 get id 2, and so on.
df2 <- df %>%
  group_by(HospNum_Id) %>%
  mutate(id = ceiling(row_number() / 2)) %>%
  ungroup()

# Split into the odd and the even rows within each HospNum_Id. A parity test
# is used rather than matching against seq(2, n, 2), because seq() raises an
# error when n is smaller than its starting value.
df_odd <- df2 %>%
  group_by(HospNum_Id) %>%
  filter(row_number() %% 2L == 1L) %>%
  ungroup()
df_even <- df2 %>%
  group_by(HospNum_Id) %>%
  filter(row_number() %% 2L == 0L) %>%
  ungroup()

# Put each odd row next to the even row sharing its id and HospNum_Id, then
# keep the pairs whose first diffDate lies within [-3, 3] days and whose
# second diffDate is not NA. The helper id column is dropped at the end.
inner_join(df_odd, df_even, by = c("id", "HospNum_Id")) %>%
  filter(between(diffDate.x, -3, 3) & !is.na(diffDate.y)) %>%
  select(-id)
# # A tibble: 3 x 5
# HospNum_Id VisitDate.x diffDate.x VisitDate.y diffDate.y
# <fct> <date> <time> <date> <time>
# 1 A791697 2017-01-15 -2 days 2017-01-17 " -9 days"
# 2 V682805 2017-01-02 -1 days 2017-01-03 -17 days
# 3 V682805 2017-01-20 -3 days 2017-01-23 -23 days
您可以将上述逻辑合并到一个管道链中,如下所示:
# Same logic as a single pipe chain: flag each row as odd or even within its
# HospNum_Id, nest the two halves, and join them back side by side.
df %>%
  group_by(HospNum_Id) %>%
  mutate(
    id = ceiling(row_number() / 2),
    # Parity test instead of matching against seq(2, n, 2), which raises an
    # error when n is smaller than 2.
    even_row = row_number() %% 2L == 0L
  ) %>%
  ungroup() %>%
  # Named form of nest(): every column except even_row goes into `data`.
  # The first nested row holds the odd rows (even_row == FALSE appears first).
  nest(data = -even_row) %>%
  pull(data) %>%
  reduce(function(x, y) inner_join(x, y, by = c("id", "HospNum_Id"))) %>%
  # Keep pairs whose first diffDate lies within [-3, 3] days and whose second
  # diffDate is not NA, then drop the helper id column.
  filter(between(diffDate.x, -3, 3) & !is.na(diffDate.y)) %>%
  select(-id)