我有2个数据框df1
和df2
。
df1
包含2列 - t1
和data1
,其中t1
从0.0001开始到75,增量为0.0001。所以它像0.0001,0.0002,0.0003 ...... 74.9999,75.0000。 data1只是0到1之间的一些数字。
df2
还包含2列 - t2
和data2
,但每列的长度只有114 - 时间列中仅包含0.0001到75之间的部分选定值,例如 14.6000,15.2451,...... 73.4568。 data2也是一些长度为114的随机数。
我从另一个数据集中提取了t2的值
# Selected time stamps (114 values) taken from another data set.
t2 <- c(
  14.6000, 14.6001, 14.6002, 14.6002, 14.6007, 14.6011, 14.6016, 14.602,
  14.6037, 14.6055, 14.6072, 14.6089, 14.6151, 14.6214, 14.6277, 14.6339,
  14.6402, 14.6545, 14.6688, 14.6831, 14.6974, 14.7117, 14.7261, 14.7573,
  14.7886, 14.8199, 14.8511, 14.8824, 14.9137, 14.9681, 15.0225, 15.0768,
  15.1312, 15.1856, 15.24, 15.3233, 15.4065, 15.4897, 15.573, 15.6562,
  15.7394, 15.8768, 16.0142, 16.1516, 16.289, 16.4264, 16.5638, 16.7676,
  16.9715, 17.1753, 17.3792, 17.583, 17.7868, 17.9907, 18.3366, 18.6826,
  19.0285, 19.3745, 19.7204, 20.0664, 20.4124, 20.9122, 21.412, 21.9118,
  22.4116, 22.9114, 23.4112, 23.911, 24.5965, 25.282, 25.9675, 26.653,
  27.3385, 28.024, 29.1158, 30.2075, 31.2993, 32.3911, 33.4828, 34.6828,
  35.8828, 37.0828, 38.2828, 39.4828, 40.6828, 41.8828, 43.0828, 44.2828,
  45.4828, 46.6828, 47.8828, 49.0828, 50.2828, 51.4828, 52.6828, 53.8828,
  55.0828, 56.2828, 57.4828, 58.6828, 59.8828, 61.0828, 62.2828, 63.4828,
  64.6828, 65.8828, 67.0828, 68.2828, 69.4828, 70.6828, 71.8828, 73.0828,
  74.2828, 74.6000
)

# df1: the full time grid from 0.0001 to 75 in steps of 0.0001, paired with
# one random normal draw per grid point.
df1 <- data.frame(
  t1 = seq(from = 0.0001, to = 75, by = 0.0001),
  data1 = rnorm(750000)
)

# df2: the selected time stamps, paired with random draws of the same length.
df2 <- data.frame(t2 = t2, data2 = rnorm(length(t2)))
我想创建一个新的数据框 - df_new
,其中包含t2
的值以及df1
中相应的data1
值:
# Keep the rows of df1 whose t1 value is matched exactly by a value in df2$t2.
df_new <- df1[df1$t1 %in% df2$t2, , drop = FALSE]
当我这样做时,df_new
只有74个观察值,而不是114个。我在这里做错了吗?
答案 0 :(得分:1)
这似乎是浮点运算的一个问题。请参阅以下两个示例。一般来说,直接比较这样的浮点数并不稳健,因为浮点数的表示精度并不完美。我选取了df2$t2
中第一个没有按预期匹配的元素。您可能以为第一个==
比较会返回TRUE,但事实并非如此。请注意,all.equal
(容易让人混淆的是,它测试的是“接近相等”而非严格相等)对我取出的这两个值实际上返回TRUE。您可以使用options
增加打印的有效数字位数,从而看出两者确实存在差异。
获得预期结果的一种方法是使用round
把两边需要比较的数字舍入到相同的小数位数(此处为4位),使它们完全相同。请注意,输出中只有113行,因为df2$t2
中只有113个唯一值(14.6002出现了两次)。您也可以考虑换用相应更小的单位,把这些数值转换为整数后再比较。
# Reproducible example: exact `==` / `%in%` matching on doubles can miss
# values that print identically, so both sides are normalised before matching.

# Selected time stamps (114 values). 14.6002 appears twice, so only 113 of
# them are unique.
t2 <- c(
  14.6000, 14.6001, 14.6002, 14.6002, 14.6007, 14.6011, 14.6016, 14.602,
  14.6037, 14.6055, 14.6072, 14.6089, 14.6151, 14.6214, 14.6277, 14.6339,
  14.6402, 14.6545, 14.6688, 14.6831, 14.6974, 14.7117, 14.7261, 14.7573,
  14.7886, 14.8199, 14.8511, 14.8824, 14.9137, 14.9681, 15.0225, 15.0768,
  15.1312, 15.1856, 15.24, 15.3233, 15.4065, 15.4897, 15.573, 15.6562,
  15.7394, 15.8768, 16.0142, 16.1516, 16.289, 16.4264, 16.5638, 16.7676,
  16.9715, 17.1753, 17.3792, 17.583, 17.7868, 17.9907, 18.3366, 18.6826,
  19.0285, 19.3745, 19.7204, 20.0664, 20.4124, 20.9122, 21.412, 21.9118,
  22.4116, 22.9114, 23.4112, 23.911, 24.5965, 25.282, 25.9675, 26.653,
  27.3385, 28.024, 29.1158, 30.2075, 31.2993, 32.3911, 33.4828, 34.6828,
  35.8828, 37.0828, 38.2828, 39.4828, 40.6828, 41.8828, 43.0828, 44.2828,
  45.4828, 46.6828, 47.8828, 49.0828, 50.2828, 51.4828, 52.6828, 53.8828,
  55.0828, 56.2828, 57.4828, 58.6828, 59.8828, 61.0828, 62.2828, 63.4828,
  64.6828, 65.8828, 67.0828, 68.2828, 69.4828, 70.6828, 71.8828, 73.0828,
  74.2828, 74.6000
)

set.seed(12345) # make the random data columns reproducible
df1 <- data.frame(t1 = seq(0.0001, 75, by = 0.0001), data1 = rnorm(750000))
df2 <- data.frame(t2 = t2, data2 = rnorm(length(t2)))

# The two values print identically at the default precision ...
df2$t2[2]
#> [1] 14.6001
df1$t1[146001]
#> [1] 14.6001
# ... yet an exact comparison reports them as different,
df1$t1[146001] == df2$t2[2]
#> [1] FALSE
# while all.equal(), which compares within a tolerance, reports them as equal.
all.equal(df1$t1[146001], df2$t2[2])
#> [1] TRUE

# Print more significant digits to expose the difference. The previous
# setting is captured so that it can be restored afterwards.
old_opts <- options(digits = 22)
df2$t2[2]
#> [1] 14.600099999999999
df1$t1[146001]
#> [1] 14.600100000000001
options(old_opts)

# Fix 1: round both sides to the 4 decimals the data is specified in.
df_new_rnd <- subset(df1, round(t1, 4) %in% round(df2$t2, 4))

# Fix 2: compare integer keys in units of 0.0001. as.integer() truncates
# toward zero, so the scaled value is rounded first; otherwise a product that
# lands just below a whole number would map to the wrong key.
df_new_int <- subset(
  df1,
  as.integer(round(t1 * 10000)) %in% as.integer(round(df2$t2 * 10000))
)

nrow(df_new_rnd)
#> [1] 113
nrow(df_new_int)
#> [1] 113
由reprex package(v0.2.0)创建于2018-05-22。