我有一个医院就诊(encounter)记录的数据集,如下所示:
VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY HOSP_DISCHRG_DT MED_ORD_ID HAD_FOLLOWUP
1 82919395 8979499 83 2014-09-07 10:47:00 58826846 1
2 82919395 8979499 83 2014-09-07 10:47:00 58826847 1
3 82919395 8979499 83 2014-09-07 10:47:00 58826848 1
4 82919395 8979499 83 2014-09-07 10:47:00 58826845 1
5 77312433 8979499 83 2015-02-01 09:33:00 98525833 1
6 77312433 8979499 83 2015-02-01 09:33:00 98525834 1
7 77312433 8979499 83 2015-02-01 09:33:00 98525835 1
以及一个后续(随访)就诊记录的数据集,如下所示:
VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY APPT_CHECKIN_DT
1 84273751 8979499 108 2015-02-07 11:57:46
2 83999897 8979499 108 2014-09-13 16:51:22
3 83881023 8979499 108 2014-11-12 10:37:51
4 83999896 8979499 108 2014-11-20 09:23:25
5 95164335 8979499 108 2016-07-27 15:30:25
6 83922326 8979499 108 2014-11-16 09:08:47
我想把后续就诊记录中 APPT_CHECKIN_DT 的最小值写入医院就诊数据集的新字段 FOLLOWUP_DT。该值必须是所有大于 HOSP_DISCHRG_DT 的 APPT_CHECKIN_DT 中最小的那一个。
例如,最终的医院就诊数据集应如下所示:
VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY HOSP_DISCHRG_DT MED_ORD_ID HAD_FOLLOWUP FOLLOWUP_DT
1 82919395 8979499 83 2014-09-07 10:47:00 58826846 1 2014-09-13 16:51:22
2 82919395 8979499 83 2014-09-07 10:47:00 58826847 1 2014-09-13 16:51:22
3 82919395 8979499 83 2014-09-07 10:47:00 58826848 1 2014-09-13 16:51:22
4 82919395 8979499 83 2014-09-07 10:47:00 58826845 1 2014-09-13 16:51:22
5 77312433 8979499 83 2015-02-01 09:33:00 98525833 1 2015-02-07 11:57:46
6 77312433 8979499 83 2015-02-01 09:33:00 98525834 1 2015-02-07 11:57:46
7 77312433 8979499 83 2015-02-01 09:33:00 98525835 1 2015-02-07 11:57:46
我尝试过在 for 循环中使用 ifelse 语句:先判断该次就诊是否有后续随访(HAD_FOLLOWUP),再检查医院就诊的 PAT_KEY 是否与门诊就诊的 PAT_KEY 匹配、且 APPT_CHECKIN_DT 是否大于 HOSP_DISCHRG_DT,最后取最小的 APPT_CHECKIN_DT 作为后续随访日期:
# Attempted approach: walk over every row of children_dx and fill
# FOLLOW_UP_DATE one element at a time.
# NOTE(review): the nested ifelse() does not implement the stated intent:
#   - the inner test is a vector (one element per outpatient row with the same
#     PAT_KEY), but the outer ifelse() has a length-one test, so only the
#     first element of the inner result is kept;
#   - min() is taken over all of the patient's APPT_CHECKIN_DT values, not
#     only over those later than HOSP_DISCHRG_DT[i].
# NOTE(review): 1:nrow() iterates over c(1, 0) for a zero-row data frame;
# seq_len(nrow(children_dx)) would be the safe form.
for (i in 1:nrow(children_dx)) {
children_dx$FOLLOW_UP_DATE[i] <-
# Outer ifelse: only encounters flagged HAD_FOLLOWUP == 1 get a date.
ifelse(children_dx$HAD_FOLLOWUP[i] == 1,
# Inner ifelse: compares the same patient's check-in times with this discharge.
ifelse(outpatient_visits$APPT_CHECKIN_DT[children_dx$PAT_KEY[i] == outpatient_visits$PAT_KEY] > children_dx$HOSP_DISCHRG_DT[i],
# Minimum over all of the patient's check-ins (no "after discharge" filter).
as.character(min(outpatient_visits$APPT_CHECKIN_DT[children_dx$PAT_KEY[i] == outpatient_visits$PAT_KEY])),
NA),NA)
}
但是,在完整数据集上运行需要很长时间;即使运行完成,得到的 FOLLOW_UP_DATE 也是整个数据集中 APPT_CHECKIN_DT 的总体最小值,而不是仅在 PAT_KEY 匹配的记录中取的最小值。
答案 0 :(得分:1)
可以考虑先用 merge 合并两个数据集并取子集(subset),再用 aggregate 配合 min 进行聚合,最后用 merge 把结果合并回原始数据框:
**数据**
# Sample data ----
# Register a virtual "myDate" class together with a character -> myDate
# coercion, so that read.table(colClasses = ...) turns the quoted timestamp
# columns directly into POSIXct values.
setClass("myDate")
setAs(
  from = "character",
  to = "myDate",
  def = function(from) {
    as.POSIXct(from, format = "%Y-%m-%d %H:%M:%S")
  }
)

# Hospital encounters: one row per medication order of a hospital visit.
# The header has one field fewer than the data rows, so the leading column
# of each row becomes the row name (hence seven colClasses for six columns).
hospital_encounters <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  colClasses = c(rep("numeric", 4), "myDate", rep("numeric", 2)),
  text = "
VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY HOSP_DISCHRG_DT MED_ORD_ID HAD_FOLLOWUP
1 82919395 8979499 83 '2014-09-07 10:47:00' 58826846 1
2 82919395 8979499 83 '2014-09-07 10:47:00' 58826847 1
3 82919395 8979499 83 '2014-09-07 10:47:00' 58826848 1
4 82919395 8979499 83 '2014-09-07 10:47:00' 58826845 1
5 77312433 8979499 83 '2015-02-01 09:33:00' 98525833 1
6 77312433 8979499 83 '2015-02-01 09:33:00' 98525834 1
7 77312433 8979499 83 '2015-02-01 09:33:00' 98525835 1"
)

# Follow-up encounters: one row per outpatient check-in.
follow_up_encounters <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  colClasses = c(rep("numeric", 4), "myDate"),
  text = " VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY APPT_CHECKIN_DT
1 84273751 8979499 108 '2015-02-07 11:57:46'
2 83999897 8979499 108 '2014-09-13 16:51:22'
3 83881023 8979499 108 '2014-11-12 10:37:51'
4 83999896 8979499 108 '2014-11-20 09:23:25'
5 95164335 8979499 108 '2016-07-27 15:30:25'
6 83922326 8979499 108 '2014-11-16 09:08:47'"
)
**过程**
# Pair every hospital encounter with each of the same patient's follow-up
# check-ins, keeping only check-ins that happen after the discharge.
mdf <- subset(
  merge(
    hospital_encounters,
    follow_up_encounters[c("PAT_KEY", "APPT_CHECKIN_DT")],
    by = "PAT_KEY"
  ),
  APPT_CHECKIN_DT > HOSP_DISCHRG_DT
)

# Earliest qualifying check-in per hospital visit. aggregate() stops with
# "no rows to aggregate" on empty input, so fall back to an empty frame with
# the same columns when no check-in qualifies.
aggdf <- if (nrow(mdf) > 0) {
  aggregate(APPT_CHECKIN_DT ~ PAT_KEY + VISIT_KEY, mdf, FUN = min)
} else {
  mdf[0, c("PAT_KEY", "VISIT_KEY", "APPT_CHECKIN_DT")]
}
names(aggdf) <- c("PAT_KEY", "VISIT_KEY", "FOLLOWUP_DT")

# all.x = TRUE keeps hospital encounters that have no later check-in
# (their FOLLOWUP_DT is NA) instead of silently dropping them, which the
# default inner join would do.
hospital_encounters <- merge(
  hospital_encounters, aggdf,
  by = c("PAT_KEY", "VISIT_KEY"),
  all.x = TRUE
)
**输出**
# Print the merged result. In the output below the key columns (PAT_KEY,
# VISIT_KEY) come first and the rows are ordered by them, so the layout
# differs from the input table.
hospital_encounters
# PAT_KEY VISIT_KEY DICT_ENC_TYPE_KEY HOSP_DISCHRG_DT MED_ORD_ID HAD_FOLLOWUP FOLLOWUP_DT
# 1 8979499 77312433 83 2015-02-01 09:33:00 98525833 1 2015-02-07 11:57:46
# 2 8979499 77312433 83 2015-02-01 09:33:00 98525834 1 2015-02-07 11:57:46
# 3 8979499 77312433 83 2015-02-01 09:33:00 98525835 1 2015-02-07 11:57:46
# 4 8979499 82919395 83 2014-09-07 10:47:00 58826846 1 2014-09-13 16:51:22
# 5 8979499 82919395 83 2014-09-07 10:47:00 58826847 1 2014-09-13 16:51:22
# 6 8979499 82919395 83 2014-09-07 10:47:00 58826848 1 2014-09-13 16:51:22
# 7 8979499 82919395 83 2014-09-07 10:47:00 58826845 1 2014-09-13 16:51:22
答案 1 :(得分:0)
使用data.table
library(data.table)

# Hospital encounters: read the data, convert to a data.table with setDT(),
# parse the discharge timestamp and copy it into KEY_DATE, the column the
# rolling join matches on. as.POSIXct() is used instead of strptime() because
# strptime() returns POSIXlt, which data.table does not support as a column
# type.
dt1 <- setDT(read.table(text="VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY HOSP_DISCHRG_DT MED_ORD_ID HAD_FOLLOWUP
82919395 8979499 83 '2014-09-07 10:47:00' 58826846 1
82919395 8979499 83 '2014-09-07 10:47:00' 58826847 1
82919395 8979499 83 '2014-09-07 10:47:00' 58826848 1
82919395 8979499 83 '2014-09-07 10:47:00' 58826845 1
77312433 8979499 83 '2015-02-01 09:33:00' 98525833 1
77312433 8979499 83 '2015-02-01 09:33:00' 98525834 1
77312433 8979499 83 '2015-02-01 09:33:00' 98525835 1", header = TRUE,
  stringsAsFactors = FALSE))[
  , HOSP_DISCHRG_DT := as.POSIXct(HOSP_DISCHRG_DT, format = "%Y-%m-%d %H:%M:%S")
][
  , KEY_DATE := HOSP_DISCHRG_DT
]

# Follow-up encounters: keep PAT_KEY so the join stays within one patient,
# and duplicate the check-in time as KEY_DATE (after the join, KEY_DATE holds
# dt1's value, so APPT_CHECKIN_DT is the copy that carries the matched
# check-in time).
dt2 <- setDT(read.table(text="VISIT_KEY PAT_KEY DICT_ENC_TYPE_KEY APPT_CHECKIN_DT
84273751 8979499 108 '2015-02-07 11:57:46'
83999897 8979499 108 '2014-09-13 16:51:22'
83881023 8979499 108 '2014-11-12 10:37:51'
83999896 8979499 108 '2014-11-20 09:23:25'
95164335 8979499 108 '2016-07-27 15:30:25'
83922326 8979499 108 '2014-11-16 09:08:47'", header = TRUE,
  stringsAsFactors = FALSE))[
  , APPT_CHECKIN_DT := as.POSIXct(APPT_CHECKIN_DT, format = "%Y-%m-%d %H:%M:%S")
][
  , list(PAT_KEY, APPT_CHECKIN_DT, KEY_DATE = APPT_CHECKIN_DT)
]

# Rolling join (see ?data.table for `roll`): PAT_KEY is matched exactly and
# roll = -Inf carries the next check-in backwards, so every hospital row gets
# the same patient's first check-in at or after its discharge time.
# NOTE(review): a check-in exactly equal to the discharge time also matches;
# filter on APPT_CHECKIN_DT > HOSP_DISCHRG_DT if strictly-later is required.
dt_followup <- dt2[dt1, on = c("PAT_KEY", "KEY_DATE"), roll = -Inf][
  , KEY_DATE := NULL # remove the joining key
]
print(dt_followup)