根据所选阈值内的数字行名称合并数据帧,并同时保留不匹配的行

时间:2018-06-22 13:11:24

标签: r join merge rowname

如何在设置匹配阈值的同时基于数字数据行名称合并两个数据框?

df1 <- structure(list(c(4974622.505928, 170582.149747, 130545.004516, 
143528.819582, 49416.594892, 51879.515558, 52027.462651, 42491.317116, 
49173.145029, 44040.01261), c(4664319.00309, 266278.599338, 204772.412837, 
204819.210688, 77718.961761, 82742.852809, 79706.774944, 67123.603629, 
67264.401059, 66750.260768), c(5906075.502923, 385318.121061, 
296824.944672, 308432.753482, 113407.50333, 120352.400266, 122622.356104, 
98656.179336, 107669.002489, 100262.855064), c(5401712.020682, 
204595.653994, 163485.509823, 179567.339348, 62690.116298, 63790.0244, 
64660.971879, 52545.84055, 59080.66972, 54579.538267), c(5273676.522307, 
159130.126808, 129607.971309, 142279.787439, 45812.561022, 47230.447746, 
48367.405274, 39578.235275, 45489.065198, 43102.923417)), row.names = c("34.9816256", 
"35.0576674", "35.0898006", "35.1270264", "35.1738664", "35.1936282", 
"35.2043582", "35.2359934", "35.2716016", "35.2993064"), class = "data.frame")

df2 <- structure(list(c(5898584.48405, 302326.226264, 185567.968257, 
205617.778019, 84476.66928, 65505.560486, 68121.465276, 63221.947902, 
55028.866127, 36821.607091), c(3719350.766633, 108177.577417, 
68855.378083, 78201.248427, 17558.118703, 23387.078772, 25374.978916, 
18833.579115, 12761.529092, 11507.348928), c(3587498.99736, 96793.741428, 
59750.485295, 70217.309923, 26233.188472, 20200.080468, 22241.999451, 
20268.485836, 17330.391134, 12503.133961), c(3128479.008712, 
70298.795438, 45668.592667, 56013.453832, 20323.368372, 16795.27218, 
16358.208042, 15722.790712, 12276.726458, 9155.522864), c(3847005.494149, 
138762.296854, 94196.099405, 106888.964213, 36614.870588, 30856.787329, 
33880.704043, 31399.328936, 27819.255931, 18560.05768)), row.names = c("34.9815906", 
"35.0356588", "35.0897702", "35.1269978", "35.1535182", "35.1744048", 
"35.1952968", "35.3032464", "35.3207828", "35.3739834"), class = "data.frame")

数据帧的输出(第一行是行名

> df1
34.9816256 4974622.51 4664319.00 5906075.50 5401712.02 5273676.52
35.0576674  170582.15  266278.60  385318.12  204595.65  159130.13
35.0898006  130545.00  204772.41  296824.94  163485.51  129607.97
35.1270264  143528.82  204819.21  308432.75  179567.34  142279.79
35.1738664   49416.59   77718.96  113407.50   62690.12   45812.56
35.1936282   51879.52   82742.85  120352.40   63790.02   47230.45
35.2043582   52027.46   79706.77  122622.36   64660.97   48367.41
35.2359934   42491.32   67123.60   98656.18   52545.84   39578.24
35.2716016   49173.15   67264.40  107669.00   59080.67   45489.07
35.2993064   44040.01   66750.26  100262.86   54579.54   43102.92

> df2
34.9815906 5898584.48 3719350.77 3587499.00 3128479.009 3847005.49
35.0356588  302326.23  108177.58   96793.74   70298.795  138762.30
35.0897702  185567.97   68855.38   59750.49   45668.593   94196.10
35.1269978  205617.78   78201.25   70217.31   56013.454  106888.96
35.1535182   84476.67   17558.12   26233.19   20323.368   36614.87
35.1744048   65505.56   23387.08   20200.08   16795.272   30856.79
35.1952968   68121.47   25374.98   22242.00   16358.208   33880.70
35.3032464   63221.95   18833.58   20268.49   15722.791   31399.33
35.3207828   55028.87   12761.53   17330.39   12276.726   27819.26
35.3739834   36821.61   11507.35   12503.13    9155.523   18560.06

如果行名称中两个数字之间的差异在[-0.02,0.02]之间,我想根据它们的行名称合并这两个数据集

换句话说,应该将df1中的每个行名与df2中的每个行名进行比较,如果发现两个行名之间的差落在[-0.02,0.02]范围内,则可以将数据合并到同一行上。如果找不到匹配项,则将NA添加到其他df没有匹配数据的地方(如full_join)。

2 个答案:

答案 0 :(得分:0)

  1. 您需要列名称,并将rownames添加为列

    library(tibble)
    colnames(df1) <- c('a1', 'b1', 'c1', 'd1', 'e1')
    df1 <- rownames_to_column(df1, "rn1")
    
    colnames(df2) <- c('a2', 'b2', 'c2', 'd2', 'e2')
    df2 <- rownames_to_column(df2, "rn2")
    
  2. 连接两个数据帧

    df3 <- cbind(df1, df2)
    
  3. 计算rownames

    之间的差异
    df3['diff'] <- as.numeric(df3$rn1) - as.numeric(df3$rn2)
    
  4. 过滤并删除不需要的列

    library(tidyverse)
    df4 <- df3 %>%
       filter(diff >= -0.02 & diff <= 0.02) %>%
       select(-c(rn1, rn2, diff))
    
    #          a1         b1        c1         d1         e1         a2         b2         c2         d2         e2
    #1 4974622.51 4664319.00 5906075.5 5401712.02 5273676.52 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49
    #2  130545.00  204772.41  296824.9  163485.51  129607.97  185567.97   68855.38   59750.49   45668.59   94196.10
    #3  143528.82  204819.21  308432.8  179567.34  142279.79  205617.78   78201.25   70217.31   56013.45  106888.96
    #4   51879.52   82742.85  120352.4   63790.02   47230.45   65505.56   23387.08   20200.08   16795.27   30856.79
    #5   52027.46   79706.77  122622.4   64660.97   48367.41   68121.47   25374.98   22242.00   16358.21   33880.70
    

答案 1 :(得分:0)

您可以使用foverlaps包中的data.table

library(data.table)

#add column names to sample data as it's NULL currently
names(df1) <- paste0("df1_", 1:ncol(df1))
names(df2) <- paste0("df2_", 1:ncol(df2))

#convert rownames as first column
setDT(df1, keep.rownames = TRUE)[]
setnames(df1, 1, "df1_rn")
setDT(df2, keep.rownames = TRUE)[]
setnames(df2, 1, "df2_rn")

#add temporary columns to both data tables
df1[, `:=`(df1_rn = as.numeric(df1_rn), temp = as.numeric(df1_rn))]
df2[, `:=`(df2_rn_minus_2 = as.numeric(df2_rn) - 0.02, df2_rn_plus_2 = as.numeric(df2_rn) + 0.02)]
setkey(df2, df2_rn_minus_2, df2_rn_plus_2)

DT = foverlaps(df1, df2, by.x = c("df1_rn", "temp"))[, !c("df2_rn_minus_2", "df2_rn_plus_2", "temp"), with = F]

给出

> DT
        df2_rn      df2_1      df2_2      df2_3      df2_4      df2_5   df1_rn      df1_1      df1_2      df1_3
 1: 34.9815906 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 34.98163 4974622.51 4664319.00 5906075.50
 2:       <NA>         NA         NA         NA         NA         NA 35.05767  170582.15  266278.60  385318.12
 3: 35.0897702  185567.97   68855.38   59750.49   45668.59   94196.10 35.08980  130545.00  204772.41  296824.94
 4: 35.1269978  205617.78   78201.25   70217.31   56013.45  106888.96 35.12703  143528.82  204819.21  308432.75
 5: 35.1744048   65505.56   23387.08   20200.08   16795.27   30856.79 35.17387   49416.59   77718.96  113407.50
 6: 35.1744048   65505.56   23387.08   20200.08   16795.27   30856.79 35.19363   51879.52   82742.85  120352.40
 7: 35.1952968   68121.47   25374.98   22242.00   16358.21   33880.70 35.19363   51879.52   82742.85  120352.40
 8: 35.1952968   68121.47   25374.98   22242.00   16358.21   33880.70 35.20436   52027.46   79706.77  122622.36
 9:       <NA>         NA         NA         NA         NA         NA 35.23599   42491.32   67123.60   98656.18
10:       <NA>         NA         NA         NA         NA         NA 35.27160   49173.15   67264.40  107669.00
11: 35.3032464   63221.95   18833.58   20268.49   15722.79   31399.33 35.29931   44040.01   66750.26  100262.86
         df1_4      df1_5
 1: 5401712.02 5273676.52
 2:  204595.65  159130.13
 3:  163485.51  129607.97
 4:  179567.34  142279.79
 5:   62690.12   45812.56
 6:   63790.02   47230.45
 7:   63790.02   47230.45
 8:   64660.97   48367.41
 9:   52545.84   39578.24
10:   59080.67   45489.07
11:   54579.54   43102.92