将日期之间的两个数据框连接在一起

时间:2019-10-12 13:44:13

标签: r

我正在尝试使用Fuzzyjoin或类似方法将两个数据帧连接在一起。

我想按 `ID` 和 `date_EOM` 将 `df2` 连接到 `df1`,但匹配条件是 `df1` 的 `date_EOM` 介于 `df2` 的 `date_EOM` 与 `date_EOM_plus_3` 之间。

也就是说,`df1` 较长,具有更多时间序列数据,我想将其与较短的 `df2` 合并,条件是 `df1` 中 `date_EOM` 的数据落在 `df2` 的 `date_EOM` 与 `date_EOM_plus_3` 之间。

以下内容无效:

library(fuzzyjoin)
df2 %>%
  fuzzy_left_join(df1,
    by = c("ID" = "ID", "date_EOM" = "date_EOM", "date_EOM" = "date_EOM_plus_3"),
    match_fun = list("=", ">=", "<=")
  )

> head(df1)
    date_EOM       ID var_1
1 2015-04-30 09627Y10 71577
2 2015-05-31 09627Y10 64829
3 2015-06-30 09627Y10 79008
4 2015-07-31 09627Y10 24319
5 2015-08-31 09627Y10 24271
6 2015-09-30 09627Y10 38051


> head(df2)
# A tibble: 6 x 4
  ID       date_EOM   date_EOM_plus_3 var_2
  <chr>    <date>     <date>          <dbl>
1 26864810 2008-02-29 2008-05-31          1
2 26864810 2009-03-31 2009-06-30          2
3 26864810 2009-02-28 2009-05-31          2
4 26864810 2010-02-28 2010-05-31          1
5 26864810 2011-02-28 2011-05-31          1
6 26864810 2012-02-29 2012-05-31          1

数据

# Example data for the longer table: 186 rows with columns date_EOM, ID and
# var_1. Rows 1-45 belong to ID "09627Y10", rows 46-186 to ID "26864810".
# date_EOM is stored as day counts since 1970-01-01 tagged with class "Date".
df1 <- structure(
  list(
    date_EOM = structure(
      c(
        # rows 1-45 (ID "09627Y10")
        16555, 16586, 16616, 16647,
        16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921,
        16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166, 17197,
        17225, 17256, 17286, 17317, 17347, 17378, 17409, 17439, 17470,
        17500, 17531, 17562, 17590, 17621, 17651, 17682, 17712, 17743,
        17774, 17804, 17835, 17865, 17896,
        # rows 46-186 (ID "26864810")
        12814, 12842, 12873, 12903,
        12934, 12964, 12995, 13026, 13056, 13087, 13117, 13148, 13179,
        13207, 13238, 13268, 13299, 13329, 13360, 13391, 13421, 13452,
        13482, 13513, 13544, 13572, 13603, 13633, 13664, 13694, 13725,
        13756, 13786, 13817, 13847, 13878, 13909, 13938, 13969, 13999,
        14030, 14060, 14091, 14122, 14152, 14183, 14213, 14244, 14275,
        14303, 14334, 14364, 14395, 14425, 14456, 14487, 14517, 14548,
        14578, 14609, 14640, 14668, 14699, 14729, 14760, 14790, 14821,
        14852, 14882, 14913, 14943, 14974, 15005, 15033, 15064, 15094,
        15125, 15155, 15186, 15217, 15247, 15278, 15308, 15339, 15370,
        15399, 15430, 15460, 15491, 15521, 15552, 15583, 15613, 15644,
        15674, 15705, 15736, 15764, 15795, 15825, 15856, 15886, 15917,
        15948, 15978, 16009, 16039, 16070, 16101, 16129, 16160, 16190,
        16221, 16251, 16282, 16313, 16343, 16374, 16404, 16435, 16466,
        16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739,
        16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013,
        17044, 17074
      ),
      class = "Date"
    ),
    # Two contiguous runs, so spell the column out with rep() instead of
    # repeating each literal.
    ID = rep(c("09627Y10", "26864810"), times = c(45L, 141L)),
    var_1 = c(
      # rows 1-45 (ID "09627Y10")
      71577,
      64829, 79008, 24319, 24271, 38051, 36962, 57471, 53909, 42452,
      30679, 38091, 28095, 32294, 51117, 24724, 42720, 51312, 53133,
      55767, 95558, 63798, 65024, 147838, 83441, 71575, 147199, 78138,
      80006, 96524, 73523, 80160, 148519, 66447, 64899, 78689, 83721,
      116659, 146079, 73399, 77594, 55159, 90624, 89813, 64276,
      # rows 46-186 (ID "26864810")
      3201253,
      2431312, 2597968, 2812961, 2246178, 2495002, 2685559, 2231979,
      3082188, 3210950, 2604852, 2863003, 4617400, 3317902, 3815995,
      2988183, 3389021, 5442709, 5431740, 4743099, 3515196, 4096597,
      6025625, 5252737, 6420185, 5342544, 6022201, 5861288, 6890111,
      6390106, 8151154, 11150273, 7440683, 11327526, 11461364, 5595098,
      12380073, 7310007, 6750283, 6652174, 7212304, 5581204, 9771562,
      4738422, 7909627, 9548136, 5429511, 4897759, 5417455, 5469542,
      6537099, 6336852, 4924378, 5408494, 5935821, 4036994, 4251811,
      5204948, 3745676, 4145843, 6015356, 3820903, 5008049, 4845117,
      5729854, 5149140, 5955255, 5512172, 5449250, 6016798, 4259770,
      3022433, 5331361, 4667700, 4916282, 3993569, 3727907, 4159248,
      3186004, 7862443, 4557679, 5054754, 4148564, 4493250, 4980311,
      3766246, 4152900, 3763739, 4553546, 4453020, 3865450, 3444880,
      3029692, 4606733, 3513674, 3308547, 6820762, 3784315, 4498774,
      5237598, 5125980, 4534635, 3831884, 2759388, 3046901, 5864084,
      3768261, 5113238, 5457462, 4306425, 4536429, 4226480, 2695787,
      2697229, 4304343, 2516059, 3771647, 3644023, 2166936, 2776204,
      3069746, 2472952, 3897729, 3710804, 2530741, 2794476, 3500625,
      3806155, 3020445, 6917279, 2540017, 2363408, 3227050, 2651100,
      2046093, 2685440, 2559308, 2642814, 2834369, 3321310, 1695951
    )
  ),
  row.names = c(NA, -186L),
  class = "data.frame"
)

数据1:

# Example data for the shorter table: 12 rows with columns ID, date_EOM,
# date_EOM_plus_3 and var_2, carrying the tibble class vector. Both date
# columns are day counts since 1970-01-01 tagged with class "Date".
df2 <- structure(
  list(
    # Ten rows for the first ID followed by two rows for the second.
    ID = rep(c("26864810", "09627Y10"), times = c(10L, 2L)),
    date_EOM = structure(
      c(
        13938, 14334, 14303, 14668, 15033, 15399,
        15764, 16129, 16494, 16860, 17256, 17590
      ),
      class = "Date"
    ),
    date_EOM_plus_3 = structure(
      c(
        14030, 14425, 14395, 14760, 15125, 15491,
        15856, 16221, 16586, 16952, 17347, 17682
      ),
      class = "Date"
    ),
    var_2 = c(1, 2, 2, 1, 1, 1, 3, 1, 4, 2, 3, 3)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -12L)
)

数据2:

<Button

df2 <-

2 个答案:

答案 0 :(得分:1)

我不确定我是否理解最终数据框应该是什么样子,但我认为 lubridate 的 `interval` 函数可以帮您实现。

library("tidyverse")
library("lubridate")

df2_interval <- interval(start = df2$date_EOM, end = df2$date_EOM_plus_3)

df1 %>%
  filter(date_EOM %within% as.list(df2_interval))

简而言之,`%within%` 检查给定日期是否在上一个函数创建的间隔内。

这只会筛选出 `df1` 中落在 `df2` 任一日期范围内的行。此时,您可以继续执行 `left_join()`(如果您打算这样做的话),或者只用 `%within%` 创建一个虚拟变量,然后从那里继续。

答案 1 :(得分:1)

您非常接近,但是您的代码存在三个问题:

  1. `match_fun` 参数中的函数需要用反引号(`)括起来,而不是引号(" 或 ')。
  2. “等于”比较函数是 `==`,而不是 `=`。
  3. 要将 `df2` 连接到 `df1`,您需要 `df1 %>% fuzzy_left_join(df2, …)`,而不是 `df2 %>% fuzzy_left_join(df1, …)`。还有其他方法可以做到这一点,但为了简单起见,我们只需交换 `df1` 和 `df2` 的位置。

以下似乎可以完成这项工作:

# Left-join df2 onto df1 with one comparison per `by` pair, taken from the
# same position in `match_fun` (left-hand column is df1's, right-hand is df2's):
#   df1$ID       == df2$ID
#   df1$date_EOM >= df2$date_EOM
#   df1$date_EOM <= df2$date_EOM_plus_3
# The comparators are passed as functions (backticked operators), not strings.
fuzzy_left_join(
  df1,
  df2,
  by = c(ID = "ID", date_EOM = "date_EOM", date_EOM = "date_EOM_plus_3"),
  match_fun = list(`==`, `>=`, `<=`)
)