假设我有两个看起来像这样的数据框:
# Example data: 25 monthly observations. X1 and X2 hold the same values;
# Date is stored as days since 1970-01-01 with class "Date".
df1 <- structure(
  list(
    X1 = c(
      0.659588465514883, 0.47368422669833, -0.0422047052887636,
      -1.75642936005977, 0.339813114272074, 1.09341750942405,
      0.327672990051479, -0.893507823167616, -0.661285321563594,
      -0.569673784617002, -0.983369868281376, -2.53659592825309,
      0.396220995581641, -1.1994504350227, -0.553343957714012,
      1.30884516680972, -0.120561033997931, 0.971506981390537,
      0.815610612704566, 1.53103368033727, -0.808956975392184,
      -1.27332589061096, -1.89082047917723, 0.249755375966669,
      -0.704051599213331
    ),
    X2 = c(
      0.659588465514883, 0.47368422669833, -0.0422047052887636,
      -1.75642936005977, 0.339813114272074, 1.09341750942405,
      0.327672990051479, -0.893507823167616, -0.661285321563594,
      -0.569673784617002, -0.983369868281376, -2.53659592825309,
      0.396220995581641, -1.1994504350227, -0.553343957714012,
      1.30884516680972, -0.120561033997931, 0.971506981390537,
      0.815610612704566, 1.53103368033727, -0.808956975392184,
      -1.27332589061096, -1.89082047917723, 0.249755375966669,
      -0.704051599213331
    ),
    Date = structure(
      c(
        10957, 10988, 11017, 11048, 11078, 11109, 11139, 11170, 11201,
        11231, 11262, 11292, 11323, 11354, 11382, 11413, 11443, 11474,
        11504, 11535, 11566, 11596, 11627, 11657, 11688
      ),
      class = "Date"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -25L)
)
X1 X2
1 -1.633636896 -1.633636896
2 1.793766808 1.793766808
3 0.440697771 0.440697771
4 0.330091148 0.330091148
5 -1.234246285 -1.234246285
6 0.044951993 0.044951993
7 -2.831295687 -2.831295687
8 -0.735371579 -0.735371579
9 -0.412580789 -0.412580789
10 0.001848622 0.001848622
11 1.480684731 1.480684731
12 -1.088999830 -1.088999830
13 -0.465903929 -0.465903929
14 -0.010743010 -0.010743010
15 1.420995930 1.420995930
16 -0.789190729 -0.789190729
17 -0.750476176 -0.750476176
18 -0.314079067 -0.314079067
19 -0.324779959 -0.324779959
20 -1.192471909 -1.192471909
21 -0.170325813 -0.170325813
22 0.890941125 0.890941125
23 0.863875448 0.863875448
24 -0.088048086 -0.088048086
25 0.021239226 0.021239226
Date
1 2000-01-01
2 2000-02-01
3 2000-03-01
4 2000-04-01
5 2000-05-01
6 2000-06-01
7 2000-07-01
8 2000-08-01
9 2000-09-01
10 2000-10-01
11 2000-11-01
12 2000-12-01
13 2001-01-01
14 2001-02-01
15 2001-03-01
16 2001-04-01
17 2001-05-01
18 2001-06-01
19 2001-07-01
20 2001-08-01
21 2001-09-01
22 2001-10-01
23 2001-11-01
24 2001-12-01
25 2002-01-01
# Example data: 5 observations on arbitrary days of the month. X1 and X2
# hold the same values; Date is stored as days since 1970-01-01.
df2 <- structure(
  list(
    X1 = c(
      -0.0712460200169048, 1.0131741924359, 0.28590272354409,
      -0.835911047943257, -0.146890264431744
    ),
    X2 = c(
      -0.0712460200169048, 1.0131741924359, 0.28590272354409,
      -0.835911047943257, -0.146890264431744
    ),
    Date = structure(
      c(10984, 11120, 11441, 11488, 11712),
      class = "Date"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -5L)
)
X1 X2 Date
1 0.03815189 0.03815189 2000-01-28
2 -0.22665838 -0.22665838 2000-06-12
3 0.36459588 0.36459588 2001-04-29
4 0.32772746 0.32772746 2001-06-15
5 -1.22891784 -1.22891784 2002-01-25
我想做的是根据 df2 的行数来减少 df1 的行数(使 df1 的行数等于 df2 的行数)。具体来说,我想删除 df1 中 Date 列的日期(按年份和月份比较)没有出现在 df2 的 Date 列中的那些行。直接看我想要的输出会更清楚:
# df1 shall become like this (n stands for the numbers corresponding to each date row):
X1 X2 Date
1 n n 2000-01-01
2 n n 2000-06-01
3 n n 2001-04-01
4 n n 2001-06-01
5 n n 2002-01-01
# It is not really important which day is displayed in the final output. What matters is just the year and month.
我尝试使用 semi_join,但问题在于两个数据框中日期的“日”不同,函数无法匹配出我想要的行。理想情况下,我需要忽略“日”,只按年份和月份进行筛选。
这是我尝试过的:
library(dplyr)
# Attempted filter: keep the rows of df1 whose Date also occurs in df2.
# The match is on the full date, and the days of the month differ between
# the two data frames, so this returns zero rows.
semi_join(df1, df2, by = "Date")
[1] X1 X2 Date
<0 rows> (or 0-length row.names)
有人可以帮助我吗?
谢谢!
答案 0 :(得分:1)
这里给出一个基于 @arg0naut91 出色建议的 base R 解决方案。首先将 Date 变量格式化为“年-月”,然后使用 %in% 检查每个日期是否存在。下面的代码使用您的 df1 和 df2:
# Build a "YYYY-MM" key column in each data frame so that the day of the
# month is ignored when the dates are compared.
df1[["I1"]] <- format(df1[["Date"]], "%Y-%m")
df2[["I2"]] <- format(df2[["Date"]], "%Y-%m")
现在,进行比较并筛选:
# Keep only the rows of df1 whose year-month key also appears in df2.
df1[df1[["I1"]] %in% df2[["I2"]], , drop = FALSE]
输出:
X1 X2 Date I1
1 0.6595885 0.6595885 2000-01-01 2000-01
6 1.0934175 1.0934175 2000-06-01 2000-06
16 1.3088452 1.3088452 2001-04-01 2001-04
18 0.9715070 0.9715070 2001-06-01 2001-06
25 -0.7040516 -0.7040516 2002-01-01 2002-01
最后,您可以将该结果赋值给一个新的数据框,并删除辅助列 I1。