我正在尝试使用 fuzzyjoin 或类似方法将两个数据帧连接在一起。
我想通过 ID 和 date_EOM 将 df1 和 df2 连接起来,但条件是 df1 中的 date_EOM 介于 df2 中的 date_EOM 和 date_EOM_plus_3 之间。
也就是说,df1 较长,具有更多时间序列数据,我想将其与较短的 df2 合并,其中 df1 的 date_EOM 落在 df2 的 date_EOM 和 date_EOM_plus_3 之间。
我尝试了以下代码:
library(fuzzyjoin)
# Asker's original attempt, kept exactly as posted; it does not produce the
# desired join.
df2 %>%
fuzzy_left_join(df1,
by = c("ID" = "ID",
"date_EOM" = "date_EOM",
# NOTE(review): the right-hand table here is df1, whose columns are
# date_EOM, ID and var_1 -- it has no date_EOM_plus_3 column.
"date_EOM" = "date_EOM_plus_3"),
# NOTE(review): these entries are quoted strings rather than comparison
# functions, and "=" is not the equality operator (`==`).
match_fun = list("=", ">=", "<=")
)
上面的代码无效。两个数据帧的前几行如下:
> head(df1)
date_EOM ID var_1
1 2015-04-30 09627Y10 71577
2 2015-05-31 09627Y10 64829
3 2015-06-30 09627Y10 79008
4 2015-07-31 09627Y10 24319
5 2015-08-31 09627Y10 24271
6 2015-09-30 09627Y10 38051
> head(df2)
# A tibble: 6 x 4
ID date_EOM date_EOM_plus_3 var_2
<chr> <date> <date> <dbl>
1 26864810 2008-02-29 2008-05-31 1
2 26864810 2009-03-31 2009-06-30 2
3 26864810 2009-02-28 2009-05-31 2
4 26864810 2010-02-28 2010-05-31 1
5 26864810 2011-02-28 2011-05-31 1
6 26864810 2012-02-29 2012-05-31 1
数据1(df1):
df1 <- structure(list(date_EOM = structure(c(16555, 16586, 16616, 16647,
16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921,
16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166, 17197,
17225, 17256, 17286, 17317, 17347, 17378, 17409, 17439, 17470,
17500, 17531, 17562, 17590, 17621, 17651, 17682, 17712, 17743,
17774, 17804, 17835, 17865, 17896, 12814, 12842, 12873, 12903,
12934, 12964, 12995, 13026, 13056, 13087, 13117, 13148, 13179,
13207, 13238, 13268, 13299, 13329, 13360, 13391, 13421, 13452,
13482, 13513, 13544, 13572, 13603, 13633, 13664, 13694, 13725,
13756, 13786, 13817, 13847, 13878, 13909, 13938, 13969, 13999,
14030, 14060, 14091, 14122, 14152, 14183, 14213, 14244, 14275,
14303, 14334, 14364, 14395, 14425, 14456, 14487, 14517, 14548,
14578, 14609, 14640, 14668, 14699, 14729, 14760, 14790, 14821,
14852, 14882, 14913, 14943, 14974, 15005, 15033, 15064, 15094,
15125, 15155, 15186, 15217, 15247, 15278, 15308, 15339, 15370,
15399, 15430, 15460, 15491, 15521, 15552, 15583, 15613, 15644,
15674, 15705, 15736, 15764, 15795, 15825, 15856, 15886, 15917,
15948, 15978, 16009, 16039, 16070, 16101, 16129, 16160, 16190,
16221, 16251, 16282, 16313, 16343, 16374, 16404, 16435, 16466,
16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739,
16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013,
17044, 17074), class = "Date"), ID = c("09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10", "09627Y10",
"09627Y10", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810"), var_1 = c(71577,
64829, 79008, 24319, 24271, 38051, 36962, 57471, 53909, 42452,
30679, 38091, 28095, 32294, 51117, 24724, 42720, 51312, 53133,
55767, 95558, 63798, 65024, 147838, 83441, 71575, 147199, 78138,
80006, 96524, 73523, 80160, 148519, 66447, 64899, 78689, 83721,
116659, 146079, 73399, 77594, 55159, 90624, 89813, 64276, 3201253,
2431312, 2597968, 2812961, 2246178, 2495002, 2685559, 2231979,
3082188, 3210950, 2604852, 2863003, 4617400, 3317902, 3815995,
2988183, 3389021, 5442709, 5431740, 4743099, 3515196, 4096597,
6025625, 5252737, 6420185, 5342544, 6022201, 5861288, 6890111,
6390106, 8151154, 11150273, 7440683, 11327526, 11461364, 5595098,
12380073, 7310007, 6750283, 6652174, 7212304, 5581204, 9771562,
4738422, 7909627, 9548136, 5429511, 4897759, 5417455, 5469542,
6537099, 6336852, 4924378, 5408494, 5935821, 4036994, 4251811,
5204948, 3745676, 4145843, 6015356, 3820903, 5008049, 4845117,
5729854, 5149140, 5955255, 5512172, 5449250, 6016798, 4259770,
3022433, 5331361, 4667700, 4916282, 3993569, 3727907, 4159248,
3186004, 7862443, 4557679, 5054754, 4148564, 4493250, 4980311,
3766246, 4152900, 3763739, 4553546, 4453020, 3865450, 3444880,
3029692, 4606733, 3513674, 3308547, 6820762, 3784315, 4498774,
5237598, 5125980, 4534635, 3831884, 2759388, 3046901, 5864084,
3768261, 5113238, 5457462, 4306425, 4536429, 4226480, 2695787,
2697229, 4304343, 2516059, 3771647, 3644023, 2166936, 2776204,
3069746, 2472952, 3897729, 3710804, 2530741, 2794476, 3500625,
3806155, 3020445, 6917279, 2540017, 2363408, 3227050, 2651100,
2046093, 2685440, 2559308, 2642814, 2834369, 3321310, 1695951
)), row.names = c(NA, -186L), class = "data.frame")
数据2(df2):
df2 <- structure(list(ID = c("26864810", "26864810", "26864810", "26864810",
"26864810", "26864810", "26864810", "26864810", "26864810", "26864810",
"09627Y10", "09627Y10"), date_EOM = structure(c(13938, 14334,
14303, 14668, 15033, 15399, 15764, 16129, 16494, 16860, 17256,
17590), class = "Date"), date_EOM_plus_3 = structure(c(14030,
14425, 14395, 14760, 15125, 15491, 15856, 16221, 16586, 16952,
17347, 17682), class = "Date"), var_2 = c(1, 2, 2, 1, 1, 1, 3,
1, 4, 2, 3, 3)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L))
(以上即两个示例数据帧的完整定义。)
答案 0 :(得分:1)
我不确定我是否理解最终数据帧应有的样子,但我认为 lubridate 的 interval 函数应该可以帮您实现目标。
library("tidyverse")
library("lubridate")
# One lubridate interval per df2 row, running from date_EOM to date_EOM_plus_3.
df2_interval <- interval(df2$date_EOM, df2$date_EOM_plus_3)
# Keep the df1 rows whose date_EOM lies inside at least one of those intervals.
# NOTE(review): only the dates are compared; ID is not part of this filter.
filter(df1, date_EOM %within% as.list(df2_interval))
%within% 检查给定日期是否落在上一个函数创建的区间内。
这样只会筛选出 df1 中落在 df2 某个日期范围内的行。此时,您可以继续执行 left_join()(如果您打算这样做的话),或者只创建一个虚拟变量(例如用 mutate()),然后从那里继续。
答案 1 :(得分:1)
您非常接近,但是您的代码存在三个问题:
1. match_fun 参数中的函数需要用反引号(`)括起来,而不是引号(" 或 ')。
2. 相等比较应使用 ==,而不是 =。
3. 要把 df2 连接到 df1 上,您需要 df1 %>% fuzzy_left_join(df2, …),而不是 df2 %>% fuzzy_left_join(df1, …)。还有其他方法可以做到这一点,但是为了简单起见,我们只需交换 df1 和 df2 的位置。

以下代码似乎可以完成这项工作:
# Interval join: keep every row of df1 and attach the df2 rows that share its
# ID and whose [date_EOM, date_EOM_plus_3] window contains df1's date_EOM.
fuzzy_left_join(
  df1, df2,
  by = c(
    ID = "ID",                    # df1$ID       == df2$ID
    date_EOM = "date_EOM",        # df1$date_EOM >= df2$date_EOM
    date_EOM = "date_EOM_plus_3"  # df1$date_EOM <= df2$date_EOM_plus_3
  ),
  # One comparison per `by` pair, in the same order; the backticks pass the
  # operators themselves as functions.
  match_fun = list(`==`, `>=`, `<=`)
)