如何在 R 中按列名求两个数据框(data frame)的交集

时间:2020-03-17 06:31:14

标签: r dataframe

我有数据1

    BAKUNG  BALATAK    BARUGA  BERINGIN CEMERLANG
1 5.397762 4.4421650 1.1371410 0.1915185 1.2023105
2 4.551889 1.1710558 0.6596748 2.2443573 5.5094816
3 9.290642 1.9318098 0.3717321 0.7481191 2.3554114
4 1.758246 1.6478570 1.1930006 0.7687339 1.5786976
5 2.497720 0.5127110 0.3331624 0.3225225 0.7541041
6 3.080921 0.6563498 1.8712953 1.0959114 1.1883456

和数据2

             BAKUNG           BALATAK   CEMERLANG
1              <NA>              <NA>        <NA>
2              <NA>                 2        <NA>
3                14                 6        <NA>
4                17              <NA>           1
5              <NA>              <NA>        <NA>
6              <NA>              <NA>        <NA>
  AMPANA TETE AMPIBABO
1        <NA>     <NA>
2        <NA>     <NA>
3        <NA>     <NA>
4        <NA>     <NA>
5        <NA>     <NA>
6        <NA>     <NA>

我想找出 data1 和 data2 中站名(列名)相同的站点,并把两边对应的列并排放在一起,得到如下结果:

    BAKUNG_data1 BAKUNG_data2 BALATAK_data1 BALATAK_data2
 1       5.397762           NA     4.4421650            NA
 2       4.551889           NA     1.1710558             2
 3       9.290642           14     1.9318098             6
 4       1.758246           17     1.6478570            NA
 5       2.497720           NA     0.5127110            NA
 6       3.080921           NA     0.6563498            NA

我尝试过

# NOTE: `by` names the key columns whose VALUES must be equal in both data
# frames for a row to be kept, so this is a row-wise join on the shared station
# columns, not a selection of those columns. With the sample data shown above
# no row has identical values in data1 and data2, which would explain the
# empty result.
abc <- merge(data1,data2, by = intersect(names(data1), names(data2)))

但是我一无所获,有什么办法解决吗?

4 个答案:

答案 0 :(得分:3)

一种方法是先把两个数据框都转换为长格式,然后按站名和行号进行联接,最后再转换回宽格式。

library(dplyr) 
library(tidyr)

# Reshape both data frames to long format (one row per observation and
# station), keep only the stations present in both with an inner join, then
# spread back to wide format with one `<station>_data1` / `<station>_data2`
# column pair per shared station.
inner_join(
  data1 %>%
    mutate(row = row_number()) %>% # explicit row id so observations line up
    pivot_longer(cols = -row, values_to = "data1"),
  data2 %>%
    mutate(row = row_number()) %>%
    pivot_longer(cols = -row, values_to = "data2"),
  by = c("name", "row")
) %>%
  pivot_wider(
    names_from = name,
    values_from = starts_with("data"),
    # The default naming would give `data1_BAKUNG`; the question asks for
    # `BAKUNG_data1`, so put the station name first.
    names_glue = "{name}_{.value}",
    # Keep each station's data1/data2 columns next to each other instead of
    # grouping all data1 columns before all data2 columns
    # (requires tidyr >= 1.2.0).
    names_vary = "slowest"
  ) %>%
  select(-row) # the helper row id is not part of the requested output

答案 1 :(得分:2)

如果您愿意更改列名

# Stations (column names) present in both data frames, in data1's column
# order. This must be computed before the columns are renamed below.
tmp <- intersect(names(data1), names(data2))

# Tag every column with its source. NOTE: this renames the columns of data1
# and data2 themselves.
colnames(data1) <- paste0(colnames(data1), "_data1")
colnames(data2) <- paste0(colnames(data2), "_data2")

# Pick the shared stations by exact name rather than by a `^name.*` regex:
# a prefix pattern would also match any other station whose name merely
# starts with a shared one, and would misread regex metacharacters (such as
# ".") in a station name. Indexing by name also returns both halves in the
# same station order.
cbind(data1[paste0(tmp, "_data1")],
      data2[paste0(tmp, "_data2")])

  BAKUNG_data1 BALATAK_data1 CEMERLANG_data1 BAKUNG_data2 BALATAK_data2 CEMERLANG_data2
1     5.397762     4.4421650       1.2023105         <NA>          <NA>            <NA>
2     4.551889     1.1710558       5.5094816         <NA>             2            <NA>
3     9.290642     1.9318098       2.3554114           14             6            <NA>
4     1.758246     1.6478570       1.5786976           17          <NA>               1
5     2.497720     0.5127110       0.7541041         <NA>          <NA>            <NA>
6     3.080921     0.6563498       1.1883456         <NA>          <NA>            <NA>

答案 2 :(得分:1)

我添加了一个“ INDEX”列以执行连接。

library(dplyr)
library(tidyverse) 

# Sample data from the question. `text =` lets read.table() create and close
# the underlying connection itself; a bare textConnection() passed as `file`
# is never closed. read.table() already returns a data.frame, so no
# as.data.frame() conversion is needed.
data1 <- read.table(text = "
BAKUNG  BALATAK    BARUGA  BERINGIN CEMERLANG
5.397762 4.4421650 1.1371410 0.1915185 1.2023105
4.551889 1.1710558 0.6596748 2.2443573 5.5094816
9.290642 1.9318098 0.3717321 0.7481191 2.3554114
1.758246 1.6478570 1.1930006 0.7687339 1.5786976
2.497720 0.5127110 0.3331624 0.3225225 0.7541041
3.080921 0.6563498 1.8712953 1.0959114 1.1883456", header = TRUE, na.strings = "<NA>")

# Row position is the only thing linking the two tables, so make it an
# explicit join key.
data1 <- data1 %>% mutate(INDEX = row_number())

data2 <- read.table(text = "
BAKUNG BALATAK CEMERLANG AMPANA_TETE AMPIBABO
<NA> <NA> <NA> <NA> <NA>
<NA> 2 <NA> <NA> <NA>
14 6 <NA> <NA> <NA>
17 <NA> 1 <NA> <NA>
<NA> <NA> <NA> <NA> <NA>
<NA> <NA> <NA> <NA> <NA>", header = TRUE, na.strings = "<NA>")

data2 <- data2 %>% mutate(INDEX = row_number())

# Join on the row index. Only columns whose names occur in both tables
# receive a suffix, so the suffix itself marks the shared stations.
data3 <- inner_join(data1, data2, by = "INDEX", suffix = c("_data1", "_data2"))

# Keep exactly the suffixed columns. Matching on the full suffix avoids
# picking up an unrelated station whose name merely contains "data"
# (contains() is a case-insensitive substring match).
data3 <- data3 %>% select(ends_with(c("_data1", "_data2")))

答案 3 :(得分:1)

Base R解决方案:

# Names of the columns (stations) present in both data frames.
common_cols <- intersect(names(df1), names(df2))

# Column-bind the shared columns of each data frame, tagging every column
# with its source. List-style indexing `df[cols]` always returns a data
# frame, whereas `df[, cols]` collapses to a bare vector when only one
# column is shared, which would break setNames() and cbind().
df3 <- cbind(
  setNames(df1[common_cols], paste0(common_cols, "_data1")),
  setNames(df2[common_cols], paste0(common_cols, "_data2"))
)

# Order the columns alphabetically so that each station's `_data1` and
# `_data2` columns end up next to each other.
df3_ordered <- df3[sort(names(df3))]

数据:

# Sample data reproducing the question's `data1`: six observations (rows)
# for five stations (columns), all numeric.
df1 <- data.frame(
  BAKUNG = c(5.397762, 4.551889, 9.290642, 1.758246, 2.49772, 3.080921),
  BALATAK = c(4.442165, 1.1710558, 1.9318098, 1.647857, 0.512711, 0.6563498),
  BARUGA = c(1.137141, 0.6596748, 0.3717321, 1.1930006, 0.3331624, 1.8712953),
  BERINGIN = c(0.1915185, 2.2443573, 0.7481191, 0.7687339, 0.3225225, 1.0959114),
  CEMERLANG = c(1.2023105, 5.5094816, 2.3554114, 1.5786976, 0.7541041, 1.1883456)
)

# Sample data reproducing the shared stations of the question's `data2`.
# Values are kept as character, but missing entries are real NA values
# rather than the literal string "<NA>" (which is only how R prints a
# missing character value), so is.na() and friends behave as expected.
df2 <- data.frame(
  BAKUNG = c(NA, NA, "14", "17", NA, NA),
  BALATAK = c(NA, "2", "6", NA, NA, NA),
  CEMERLANG = c(NA, NA, NA, "1", NA, NA),
  stringsAsFactors = FALSE # keep character columns on R < 4.0 as well
)