如何在设置匹配阈值的同时基于数字数据行名称合并两个数据框?
df1 <- structure(list(c(4974622.505928, 170582.149747, 130545.004516,
143528.819582, 49416.594892, 51879.515558, 52027.462651, 42491.317116,
49173.145029, 44040.01261), c(4664319.00309, 266278.599338, 204772.412837,
204819.210688, 77718.961761, 82742.852809, 79706.774944, 67123.603629,
67264.401059, 66750.260768), c(5906075.502923, 385318.121061,
296824.944672, 308432.753482, 113407.50333, 120352.400266, 122622.356104,
98656.179336, 107669.002489, 100262.855064), c(5401712.020682,
204595.653994, 163485.509823, 179567.339348, 62690.116298, 63790.0244,
64660.971879, 52545.84055, 59080.66972, 54579.538267), c(5273676.522307,
159130.126808, 129607.971309, 142279.787439, 45812.561022, 47230.447746,
48367.405274, 39578.235275, 45489.065198, 43102.923417)), row.names = c("34.9816256",
"35.0576674", "35.0898006", "35.1270264", "35.1738664", "35.1936282",
"35.2043582", "35.2359934", "35.2716016", "35.2993064"), class = "data.frame")
df2 <- structure(list(c(5898584.48405, 302326.226264, 185567.968257,
205617.778019, 84476.66928, 65505.560486, 68121.465276, 63221.947902,
55028.866127, 36821.607091), c(3719350.766633, 108177.577417,
68855.378083, 78201.248427, 17558.118703, 23387.078772, 25374.978916,
18833.579115, 12761.529092, 11507.348928), c(3587498.99736, 96793.741428,
59750.485295, 70217.309923, 26233.188472, 20200.080468, 22241.999451,
20268.485836, 17330.391134, 12503.133961), c(3128479.008712,
70298.795438, 45668.592667, 56013.453832, 20323.368372, 16795.27218,
16358.208042, 15722.790712, 12276.726458, 9155.522864), c(3847005.494149,
138762.296854, 94196.099405, 106888.964213, 36614.870588, 30856.787329,
33880.704043, 31399.328936, 27819.255931, 18560.05768)), row.names = c("34.9815906",
"35.0356588", "35.0897702", "35.1269978", "35.1535182", "35.1744048",
"35.1952968", "35.3032464", "35.3207828", "35.3739834"), class = "data.frame")
数据帧的输出(第一行是行名)
> df1
34.9816256 4974622.51 4664319.00 5906075.50 5401712.02 5273676.52
35.0576674 170582.15 266278.60 385318.12 204595.65 159130.13
35.0898006 130545.00 204772.41 296824.94 163485.51 129607.97
35.1270264 143528.82 204819.21 308432.75 179567.34 142279.79
35.1738664 49416.59 77718.96 113407.50 62690.12 45812.56
35.1936282 51879.52 82742.85 120352.40 63790.02 47230.45
35.2043582 52027.46 79706.77 122622.36 64660.97 48367.41
35.2359934 42491.32 67123.60 98656.18 52545.84 39578.24
35.2716016 49173.15 67264.40 107669.00 59080.67 45489.07
35.2993064 44040.01 66750.26 100262.86 54579.54 43102.92
> df2
34.9815906 5898584.48 3719350.77 3587499.00 3128479.009 3847005.49
35.0356588 302326.23 108177.58 96793.74 70298.795 138762.30
35.0897702 185567.97 68855.38 59750.49 45668.593 94196.10
35.1269978 205617.78 78201.25 70217.31 56013.454 106888.96
35.1535182 84476.67 17558.12 26233.19 20323.368 36614.87
35.1744048 65505.56 23387.08 20200.08 16795.272 30856.79
35.1952968 68121.47 25374.98 22242.00 16358.208 33880.70
35.3032464 63221.95 18833.58 20268.49 15722.791 31399.33
35.3207828 55028.87 12761.53 17330.39 12276.726 27819.26
35.3739834 36821.61 11507.35 12503.13 9155.523 18560.06
如果行名称中两个数字之间的差异在[-0.02,0.02]之间,我想根据它们的行名称合并这两个数据集
换句话说,应该将df1中的每个行名与df2中的每个行名进行比较,如果发现两个行名之间的差落在[-0.02,0.02]范围内,则可以将数据合并到同一行上。如果找不到匹配项,则将NA添加到其他df没有匹配数据的地方(如full_join)。
答案 0 :(得分:0)
您需要列名称,并将rownames
添加为列
library(tibble)
colnames(df1) <- c('a1', 'b1', 'c1', 'd1', 'e1')
df1 <- rownames_to_column(df1, "rn1")
colnames(df2) <- c('a2', 'b2', 'c2', 'd2', 'e2')
df2 <- rownames_to_column(df2, "rn2")
连接两个数据帧
df3 <- cbind(df1, df2)
计算rownames
df3['diff'] <- as.numeric(df3$rn1) - as.numeric(df3$rn2)
过滤并删除不需要的列
library(tidyverse)
df4 <- df3 %>%
filter(diff >= -0.02 & diff <= 0.02) %>%
select(-c(rn1, rn2, diff))
# a1 b1 c1 d1 e1 a2 b2 c2 d2 e2
#1 4974622.51 4664319.00 5906075.5 5401712.02 5273676.52 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49
#2 130545.00 204772.41 296824.9 163485.51 129607.97 185567.97 68855.38 59750.49 45668.59 94196.10
#3 143528.82 204819.21 308432.8 179567.34 142279.79 205617.78 78201.25 70217.31 56013.45 106888.96
#4 51879.52 82742.85 120352.4 63790.02 47230.45 65505.56 23387.08 20200.08 16795.27 30856.79
#5 52027.46 79706.77 122622.4 64660.97 48367.41 68121.47 25374.98 22242.00 16358.21 33880.70
答案 1 :(得分:0)
您可以使用foverlaps
包中的data.table
library(data.table)
#add column names to sample data as it's NULL currently
names(df1) <- paste0("df1_", 1:ncol(df1))
names(df2) <- paste0("df2_", 1:ncol(df2))
#convert rownames as first column
setDT(df1, keep.rownames = TRUE)[]
setnames(df1, 1, "df1_rn")
setDT(df2, keep.rownames = TRUE)[]
setnames(df2, 1, "df2_rn")
#add temporary columns to both data tables
df1[, `:=`(df1_rn = as.numeric(df1_rn), temp = as.numeric(df1_rn))]
df2[, `:=`(df2_rn_minus_2 = as.numeric(df2_rn) - 0.02, df2_rn_plus_2 = as.numeric(df2_rn) + 0.02)]
setkey(df2, df2_rn_minus_2, df2_rn_plus_2)
DT = foverlaps(df1, df2, by.x = c("df1_rn", "temp"))[, !c("df2_rn_minus_2", "df2_rn_plus_2", "temp"), with = F]
给出
> DT
df2_rn df2_1 df2_2 df2_3 df2_4 df2_5 df1_rn df1_1 df1_2 df1_3
1: 34.9815906 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 34.98163 4974622.51 4664319.00 5906075.50
2: <NA> NA NA NA NA NA 35.05767 170582.15 266278.60 385318.12
3: 35.0897702 185567.97 68855.38 59750.49 45668.59 94196.10 35.08980 130545.00 204772.41 296824.94
4: 35.1269978 205617.78 78201.25 70217.31 56013.45 106888.96 35.12703 143528.82 204819.21 308432.75
5: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.17387 49416.59 77718.96 113407.50
6: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.19363 51879.52 82742.85 120352.40
7: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.19363 51879.52 82742.85 120352.40
8: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.20436 52027.46 79706.77 122622.36
9: <NA> NA NA NA NA NA 35.23599 42491.32 67123.60 98656.18
10: <NA> NA NA NA NA NA 35.27160 49173.15 67264.40 107669.00
11: 35.3032464 63221.95 18833.58 20268.49 15722.79 31399.33 35.29931 44040.01 66750.26 100262.86
df1_4 df1_5
1: 5401712.02 5273676.52
2: 204595.65 159130.13
3: 163485.51 129607.97
4: 179567.34 142279.79
5: 62690.12 45812.56
6: 63790.02 47230.45
7: 63790.02 47230.45
8: 64660.97 48367.41
9: 52545.84 39578.24
10: 59080.67 45489.07
11: 54579.54 43102.92