我有两个带有日期的(较大的)数据框。我想按SUB_ID合并这两个数据框并保留所有日期,以创建尽可能完整的时间轴,但时间轴有一条规则:如果某个SUB_ID在df_02中有记录,则需要从两个数据框中第一个相同的END(即df_02中第一个可用的日期)开始,用df_02的日期替换df_01中该SUB_ID的日期。
例如,在此示例中,对于ID 226699 下的SUB_ID 40843342,我需要替换从2006-12-31 开始的日期。
下面附上有问题的案例的dput() 输出:
df_01:
structure(list(ID = c(81, 226699, 226699, 226699, 226699, 226699,
81, 81, 81, 81, 81, 226699, 226699, 226699, 226699, 226699, 226699,
226699, 226699, 226699, 226699, 226699, 226699, 226699, 81, 81,
81), SUB_ID = c(99026150L, 40843342L, 40843342L, 40843342L, 40843342L,
40843342L, 40816464L, 40816464L, 40816464L, 40816464L, 40816464L,
27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 27415546L,
27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 27415546L,
27415546L, 144910L, 144910L, 68340L), TYPE = c("1", "B", "B",
"B", "B", "4", "1", "1", "C", "1", "1", "1", "1", "1", "1", "1",
"A", "A", "A", "A", "C", "1", "1", "1", "1", "1", "1"), END = structure(c(16283,
16678, 16313, 16225, 15278, 13513, 16343, 16313, 16282, 14699,
14244, 2932896, 17837, 17378, 17166, 17074, 16678, 13969, 13725,
13603, 13452, 13268, 13238, 13148, 2932896, 17712, 17531), class = "Date"),
START = structure(c(15065, 16314, 16226, 15279, 13514, 11778,
16314, 16283, 14700, 14245, 13514, 17838, 17379, 17167, 17075,
16679, 13970, 13726, 13604, 13453, 13269, 13239, 13149, 12874,
17713, 17532, 16344), class = "Date"), VALUE = c(3L, 3L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), ORIGIN = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0)), row.names = c(429970L, 2542937L, 2542964L,
2542994L, 2543028L, 2543070L, 2810465L, 2810487L, 2810509L, 2810540L,
2810565L, 5501774L, 5501800L, 5501828L, 5501856L, 5501882L, 5501912L,
5501949L, 5501976L, 5502003L, 5502030L, 5502057L, 5502084L, 5502111L,
14426231L, 14426236L, 15819358L), class = "data.frame")
df_02:
structure(list(ID= c(226699, 226699, 81, 81, 81, 81, 81,
81, 81, 81, 81, 81, 81), SUB_ID= c(40843342L, 40843342L, 40816464L,
40816464L, 40816464L, 40816464L, 40816464L, 40816464L, 40816464L,
40816464L, 40816464L, 40816464L, 40816464L), TYPE= c("B", "4",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), END= structure(c(13513,
11992, 13513, 12325, 11991, 11960, 11777, 11503, 11473, 11412,
11322, 11261, 10591), class = "Date"), START= structure(c(13453,
11778, 12326, 11992, 11961, 11778, 11504, 11474, 11413, 11323,
11262, 10592, 10317), class = "Date"), VALUE= c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), ORIGIN = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(207418L, 207420L,
250797L, 250798L, 250799L, 250800L, 250801L, 250802L, 250803L,
250804L, 250805L, 250806L, 250807L), class = "data.frame")
如果我使用rbind(df_01,df_02),则ID 81 的时间轴是完整的,但ID 226699 的数据会被污染(2006-12-31 之后的日期不会被剔除)。另一方面,如果我先进行连接(join)再应用标志逻辑,则可以得到226699 的正确时间轴,但会丢失81 的数据。
这是我一直在使用的标志逻辑:
# Pair every df_01 row with every df_02 row that shares the same ID/SUB_ID.
# Columns present in both inputs get the suffixes .x (df_01) and .y (df_02).
df_final <- dplyr::full_join(df_01, df_02, by = c("ID", "SUB_ID"))

# flag is 1 where the df_01 END equals the paired df_02 END and 0 otherwise,
# including rows with no df_02 partner (END.y is NA).
df_final$flag <- ifelse(
  is.na(df_final$END.y),
  0,
  ifelse(df_final$END.x == df_final$END.y, 1, 0)
)

# dflag switches to 1 at the first flagged row of each SUB_ID and stays 1 for
# the rest of that group. ungroup() keeps the grouping from leaking into
# later steps.
df_final <- df_final %>%
  dplyr::group_by(SUB_ID) %>%
  dplyr::mutate(dflag = cummax(flag == 1)) %>%
  dplyr::ungroup()

# Use the df_01 dates before the switch and the df_02 dates from it onwards.
# dplyr::if_else() is used instead of ifelse() because ifelse() drops the
# Date class and would return bare day counts.
df_final$END_FINAL <- dplyr::if_else(
  df_final$dflag == 0,
  df_final$END.x,
  df_final$END.y
)
df_final$START_FINAL <- dplyr::if_else(
  df_final$dflag == 0,
  df_final$START.x,
  df_final$START.y
)
最后,数据框应包含df_01和df_02的所有日期的组合;如果df_02中的任何END与df_01中的END相同,则从那一刻起,该SUB_ID此后的日期应仅来自df_02:
基本示例,为简洁起见。
(按SUB_ID,START降序排列)
# Reorder the rows by SUB_ID, then START, both descending, and print the result.
row_order <- order(df_final$SUB_ID, df_final$START, decreasing = TRUE)
df_final <- df_final[row_order, ]
df_final
ID | SUB_ID | TYPE | END | START | VALUE | ORIGIN
81 | 99026150 | 1 | 2014-08-01 | 2011-04-01 | 3 | 0 <-- df_01
81 | 40816464 | 1 | 2014-09-30 | 2014-09-01 | 3 | 0 <-- df_01
... The combination of df_01 and df_02 dates for this SUB_ID ....
... Ending with the last record for this SUB_ID in df_02 ........
81 | 40816464 | 1 | 1998-12-31 | 1998-04-01 | 3 | 1 <-- df_02
... The rest of the dates for this ID (that are in df_01) .......
226699 | 40843342 | 1 | 2015-08-31 | 2014-09-01 | 3 | 0 <-- df_01
... All the dates for this SUB_ID until it reaches 2006-12-31 ...
... Instead of ending with 2006-12-31 2002-04-01 ................
... it should end with the two lines of df_02 ...................
226699 | 40843342 | B | 2006-12-31 | 2006-11-01 | 3 | 1 <-- df_02
226699 | 40843342 | 4 | 2002-11-01 | 2002-04-01 | 3 | 1 <-- df_02
... The rest of the dates for this ID (that are in df_01) .......
任何帮助将不胜感激。
答案 0 :(得分:0)
您可以尝试类似的操作:
# Load the tidyverse. library() stops with an error if the package is missing,
# whereas require() only returns FALSE and lets the script fail later.
library(tidyverse)

# Nest each data frame by SUB_ID: one row per SUB_ID, with all remaining
# columns packed into a list-column named `data`.
df_01 <- nest(df_01, data = -SUB_ID)
df_02 <- nest(df_02, data = -SUB_ID)

# Merge by SUB_ID, keeping SUB_IDs found in either data frame. The two `data`
# list-columns come out as data.x (from df_01) and data.y (from df_02).
df <- merge(df_01, df_02, by = "SUB_ID", all = TRUE) %>%
  # If df_02 has data for this SUB_ID keep it; otherwise keep the df_01 data.
  mutate(keep = ifelse(is.na(data.y), data.x, data.y)) %>%
  # Drop the per-source list-columns now that `keep` holds the chosen one.
  select(-data.x, -data.y) %>%
  # Unnest the kept data back into ordinary rows.
  unnest(keep)
此外,如果您想从df_01
或df_02
的原始数据中“获取”某些信息,则可以构建一个函数并通过purrr::map2()