合并两个数据框,保留所有日期

时间:2019-04-10 09:25:43

标签: r date dataframe datatable

我有两个(很大的)带有日期的数据框。我想按SUB_ID合并这两个数据框并保留所有日期,以构建尽可能完整的时间轴,但时间轴有一条规则:如果某个SUB_ID在df_02中有记录,则需要从两个数据框中第一个相同的END(即df_02中第一个可用日期)开始,用df_02的日期替换该SUB_ID在df_01中的日期。

例如,在此示例中,对于ID 226699、SUB_ID 40843342,我需要从2006-12-31开始替换日期。

我添加有问题的案例的dput()

df_01:

structure(list(ID = c(81, 226699, 226699, 226699, 226699, 226699, 
81, 81, 81, 81, 81, 226699, 226699, 226699, 226699, 226699, 226699, 
226699, 226699, 226699, 226699, 226699, 226699, 226699, 81, 81, 
81), SUB_ID = c(99026150L, 40843342L, 40843342L, 40843342L, 40843342L, 
40843342L, 40816464L, 40816464L, 40816464L, 40816464L, 40816464L, 
27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 
27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 27415546L, 
27415546L, 144910L, 144910L, 68340L), TYPE = c("1", "B", "B", 
"B", "B", "4", "1", "1", "C", "1", "1", "1", "1", "1", "1", "1", 
"A", "A", "A", "A", "C", "1", "1", "1", "1", "1", "1"), END = structure(c(16283, 
16678, 16313, 16225, 15278, 13513, 16343, 16313, 16282, 14699, 
14244, 2932896, 17837, 17378, 17166, 17074, 16678, 13969, 13725, 
13603, 13452, 13268, 13238, 13148, 2932896, 17712, 17531), class = "Date"), 
    START = structure(c(15065, 16314, 16226, 15279, 13514, 11778, 
    16314, 16283, 14700, 14245, 13514, 17838, 17379, 17167, 17075, 
    16679, 13970, 13726, 13604, 13453, 13269, 13239, 13149, 12874, 
    17713, 17532, 16344), class = "Date"), VALUE = c(3L, 3L, 
    1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), ORIGIN = c(0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0)), row.names = c(429970L, 2542937L, 2542964L, 
2542994L, 2543028L, 2543070L, 2810465L, 2810487L, 2810509L, 2810540L, 
2810565L, 5501774L, 5501800L, 5501828L, 5501856L, 5501882L, 5501912L, 
5501949L, 5501976L, 5502003L, 5502030L, 5502057L, 5502084L, 5502111L, 
14426231L, 14426236L, 15819358L), class = "data.frame")

df_02:

structure(list(ID= c(226699, 226699, 81, 81, 81, 81, 81, 
81, 81, 81, 81, 81, 81), SUB_ID= c(40843342L, 40843342L, 40816464L, 
40816464L, 40816464L, 40816464L, 40816464L, 40816464L, 40816464L, 
40816464L, 40816464L, 40816464L, 40816464L), TYPE= c("B", "4", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), END= structure(c(13513, 
11992, 13513, 12325, 11991, 11960, 11777, 11503, 11473, 11412, 
11322, 11261, 10591), class = "Date"), START= structure(c(13453, 
11778, 12326, 11992, 11961, 11778, 11504, 11474, 11413, 11323, 
11262, 10592, 10317), class = "Date"), VALUE= c(3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), ORIGIN = c(1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(207418L, 207420L, 
250797L, 250798L, 250799L, 250800L, 250801L, 250802L, 250803L, 
250804L, 250805L, 250806L, 250807L), class = "data.frame")

如果我使用rbind(df_01,df_02),则ID 81的数据是完整的,但ID 226699的数据会受到污染(2006-12-31之后的日期没有被剔除)。另一方面,如果我先连接再应用标志逻辑,则可以得到226699的正确时间轴,但会丢失81的日期。

这是我一直在使用的标志逻辑:

# Pair every df_01 row with every df_02 row that shares the same ID and SUB_ID;
# columns present in both inputs get the suffixes .x (df_01) and .y (df_02).
df_final <- dplyr::full_join(df_01, df_02, by = c("ID", "SUB_ID"))

# flag is 1 where the df_01 END equals the df_02 END, and 0 where END.y is
# missing or the two END values differ (a missing END.x with a present END.y
# yields NA).
df_final$flag <- ifelse(
  is.na(df_final$END.y),
  0,
  ifelse(df_final$END.x == df_final$END.y, 1, 0)
)

# Within each SUB_ID, dflag switches to 1 at the first flagged row and stays 1
# for the rows that follow. Ungroup afterwards so later steps are not silently
# evaluated per group.
df_final <- df_final %>%
  dplyr::group_by(SUB_ID) %>%
  dplyr::mutate(dflag = cummax(flag == 1)) %>%
  dplyr::ungroup()

# Use dplyr::if_else() rather than ifelse(): ifelse() drops the Date class and
# would return bare day counts instead of dates.
df_final$END_FINAL <- dplyr::if_else(
  df_final$dflag == 0, df_final$END.x, df_final$END.y
)
df_final$START_FINAL <- dplyr::if_else(
  df_final$dflag == 0, df_final$START.x, df_final$START.y
)

最后,数据框应包含df_01和df_02的所有日期的组合;如果df_02中的任何END与df_01中的END相同,则从那一刻起,该SUB_ID此后的日期应仅来自df_02:

基本示例,为简洁起见。 (按SUB_ID,START降序排列)

# Sort rows by SUB_ID, then START, both in descending order, and print.
row_order <- order(df_final$SUB_ID, df_final$START, decreasing = TRUE)
df_final <- df_final[row_order, ]
df_final

ID     | SUB_ID   | TYPE | END         | START       | VALUE | ORIGIN
81     | 99026150 | 1    | 2014-08-01  | 2011-04-01  | 3     | 0  <-- df_01
81     | 40816464 | 1    | 2014-09-30  | 2014-09-01  | 3     | 0  <-- df_01
... The combination of df_01 and df_02 dates for this SUB_ID ....
... Ending with the last record for this SUB_ID in df_02 ........
81     | 40816464 | 1    | 1998-12-31  | 1998-04-01  | 3     | 1  <-- df_02
... The rest of the dates for this ID (that are in df_01) .......
226699 | 40843342 | 1    | 2015-08-31  | 2014-09-01  | 3     | 0  <-- df_01
... All the dates for this SUB_ID until it reaches 2006-12-31 ...
... Instead of ending with 2006-12-31 2002-04-01 ................
... it should end with the two lines of df_02 ...................
226699 | 40843342 | B    | 2006-12-31  | 2006-11-01  | 3     | 1  <-- df_02
226699 | 40843342 | 4    | 2002-11-01  | 2002-04-01  | 3     | 1  <-- df_02
... The rest of the dates for this ID (that are in df_01) .......

任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

您可以尝试类似的操作:

library(tidyverse)

# Nest each data frame so there is one row per SUB_ID, with the remaining
# columns packed into a list-column named `data`.
df_01 <- nest(df_01, -SUB_ID)
df_02 <- nest(df_02, -SUB_ID)

# Merge by SUB_ID, keeping SUB_IDs that appear in either data frame.
df <- merge(df_01, df_02, by = "SUB_ID", all = TRUE) %>%
  # If df_02 has data for this SUB_ID keep it; otherwise keep df_01's data.
  mutate(keep = ifelse(is.na(data.y), data.x, data.y)) %>%
  # Drop the per-source list-columns now that `keep` holds the chosen one.
  select(-data.x, -data.y) %>%
  # Unnest the kept data back into regular rows.
  unnest(keep)

此外,如果您想从df_01/df_02的原始数据中“获取”某些信息,则可以构建一个函数,并通过purrr::map2()来应用它。