如何通过每个ID的多列联接表

时间:2019-07-01 13:01:46

标签: r join grouping

我被迫删除上一个主题,因为没有很好地询问它,并且该示例有点复杂,所以这里是一个简单的主题。

我有2个数据框:

# Client table: one row per (client_code, id2) pair, with the client's group
# id1 and the client's own value1 / value2.
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)

> DF1
  id1 client_code id2 value1 value2
1   1          x1   a    0.1    1.1
2   1          x1   b    0.2    1.2
3   1          x1   c    0.3    1.3
4   1          x2   d    0.4    1.4
5   1          x2   e    0.5    1.5
6   2          x3   y    0.6    1.6

# Reference table: for every group id1, the full list of id2 entries together
# with their value1 / value2.
DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27)
)

> DF2
  id1 id2 value1 value2
1   1   a     10     20
2   1   b     11     21
3   1   c     12     22
4   1   d     13     23
5   1   e     14     24
6   1   f     15     25
7   2   x     16     26
8   2   y     17     27

每个客户都属于由(id1)列标识的一组客户

我想做的是把DF2中有、但DF1中缺少的行补充到DF1里,即那些id2在DF1中不存在的行。对于DF1中属于同一客户组(id1)的每个客户(client_code),都应分别执行这一操作。

(我不知道自己是否足够清楚)

所需的输出:

# Expected result: every client carries all id2 entries of its id1 group.
# Rows the client already had keep the client's own values; the added rows
# take value1 / value2 from DF2.
output <- data.frame(
  id1 = rep(c(1, 2), times = c(12, 2)),
  client_code = rep(c("x1", "x2", "x3"), times = c(6, 6, 2)),
  id2 = c(
    "a", "b", "c", "d", "e", "f",
    "d", "e", "a", "b", "c", "f",
    "y", "x"
  ),
  value1 = c(
    0.1, 0.2, 0.3, 13, 14, 15,
    0.4, 0.5, 10, 11, 12, 15,
    0.6, 16
  ),
  value2 = c(
    1.1, 1.2, 1.3, 23, 24, 25,
    1.4, 1.5, 20, 21, 22, 25,
    1.6, 26
  )
)

> output
   id1 client_code id2 value1 value2
1    1          x1   a    0.1    1.1
2    1          x1   b    0.2    1.2
3    1          x1   c    0.3    1.3
4    1          x1   d   13.0   23.0
5    1          x1   e   14.0   24.0
6    1          x1   f   15.0   25.0
7    1          x2   d    0.4    1.4
8    1          x2   e    0.5    1.5
9    1          x2   a   10.0   20.0
10   1          x2   b   11.0   21.0
11   1          x2   c   12.0   22.0
12   1          x2   f   15.0   25.0
13   2          x3   y    0.6    1.6
14   2          x3   x   16.0   26.0

谢谢。

4 个答案:

答案 0 :(得分:4)

首先,创建一个表,列出每个客户应当具备的所有id2值:

library(tidyverse)

# Every distinct (client_code, id1) pair of df1 combined with all df2 rows
# sharing that id1: the complete set of id2 rows, carrying df2's values, that
# each client should end up with.
client_defaults <- left_join(
  distinct(df1, client_code, id1),
  df2,
  by = "id1"
)

client_defaults
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a     10     20
#> 2           x1   1   b     11     21
#> 3           x1   1   c     12     22
#> 4           x1   1   d     13     23
#> 5           x1   1   e     14     24
#> 6           x1   1   f     15     25
#> 7           x2   1   a     10     20
#> 8           x2   1   b     11     21
#> 9           x2   1   c     12     22
#> 10          x2   1   d     13     23
#> 11          x2   1   e     14     24
#> 12          x2   1   f     15     25
#> 13          x3   2   x     16     26
#> 14          x3   2   y     17     27

然后,删除df1中已经存在的行,然后添加其余的行:

# Keep only the default rows whose (client_code, id2) pair is absent from df1.
client_missing <- anti_join(
  client_defaults,
  df1,
  by = c("client_code", "id2")
)

# Stack the original rows on top of the missing ones and sort by client.
arrange(bind_rows(df1, client_missing), client_code)
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a    0.1    1.1
#> 2           x1   1   b    0.2    1.2
#> 3           x1   1   c    0.3    1.3
#> 4           x1   1   d   13.0   23.0
#> 5           x1   1   e   14.0   24.0
#> 6           x1   1   f   15.0   25.0
#> 7           x2   1   d    0.4    1.4
#> 8           x2   1   e    0.5    1.5
#> 9           x2   1   a   10.0   20.0
#> 10          x2   1   b   11.0   21.0
#> 11          x2   1   c   12.0   22.0
#> 12          x2   1   f   15.0   25.0
#> 13          x3   2   y    0.6    1.6
#> 14          x3   2   x   16.0   26.0

数据:

# Sample data used by this answer; string columns stay character vectors.
df1 <- data.frame(
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id1 = rep(c(1, 2), times = c(5, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6),
  stringsAsFactors = FALSE
)

df2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27),
  stringsAsFactors = FALSE
)

reprex package(v0.2.1)于2019-07-01创建

答案 1 :(得分:2)

这是一个可能的data.table解决方案,这更加简洁。本质上,它执行两个步骤:

  1. 对DF1中由id1和client_code确定的每个组,按id1联接DF2,构建完整的数据表。
  2. 用DF1中对应的值更新value1和value2。
library(data.table)

# Convert both data frames to data.tables by reference (no copy is made).
setDT(DF1); setDT(DF2)

# Step 1: for every (id1, client_code) group of DF1, look up the DF2 rows with
# the same id1 (.BY holds the current group's key values; only id1 is used for
# the join) and keep DF2's columns id2 through value2. Each client thereby
# gets one row per DF2 entry of its id1 group, carrying DF2's values.
DF <- DF1[, DF2[.BY, .SD, on = "id1", .SDcols = id2:value2], by = .(id1, client_code)]
# Step 2: update join. Where DF has a row matching DF1 on
# (id1, client_code, id2), overwrite value1 / value2 in place with DF1's
# values (the i. prefix refers to columns of DF1, the table in the i slot).
DF[DF1, `:=`(value1 = i.value1, value2 = i.value2), on = c("id1", "client_code", "id2")]

DF
#>     id1 client_code id2 value1 value2
#>  1:   1          x1   a    0.1    1.1
#>  2:   1          x1   b    0.2    1.2
#>  3:   1          x1   c    0.3    1.3
#>  4:   1          x1   d   13.0   23.0
#>  5:   1          x1   e   14.0   24.0
#>  6:   1          x1   f   15.0   25.0
#>  7:   1          x2   a   10.0   20.0
#>  8:   1          x2   b   11.0   21.0
#>  9:   1          x2   c   12.0   22.0
#> 10:   1          x2   d    0.4    1.4
#> 11:   1          x2   e    0.5    1.5
#> 12:   1          x2   f   15.0   25.0
#> 13:   2          x3   x   16.0   26.0
#> 14:   2          x3   y    0.6    1.6

reprex package(v0.3.0)于2019-07-01创建

答案 2 :(得分:2)

这是使用sql查询的解决方案。

library(sqldf)

# Builds the completed table in a single SQL query over df1 and df2:
#   a - every df1 row; supplies the (id1, client_code) pairs
#   b - all df2 rows with the same id1; supplies every id2 of that group
#   d - the client's own df1 row for that id2, when one exists
#   e - the df2 row for that (id1, id2); used as the fallback values
# coalesce() takes the client's own values (d) first and falls back to df2's
# values (e). `distinct` removes the duplicates that arise because `a`
# contributes one row per id2 the client already has.
sqldf('
select  distinct
        a.id1
        , a.client_code
        , b.id2
        , coalesce(d.value1, e.value1) as value1
        , coalesce(d.value2, e.value2) as value2
from    df1 a
        left join df2 b
          on  a.id1 = b.id1
        left join df1 d
          on  a.id1 = d.id1
              and b.id2 = d.id2
              and a.client_code = d.client_code
        left join df2 e
          on  a.id1 = e.id1
              and b.id2 = e.id2
')

#    id1 client_code id2 value1 value2
# 1    1          x1   a    0.1    1.1
# 2    1          x1   b    0.2    1.2
# 3    1          x1   c    0.3    1.3
# 4    1          x1   d   13.0   23.0
# 5    1          x1   e   14.0   24.0
# 6    1          x1   f   15.0   25.0
# 7    1          x2   a   10.0   20.0
# 8    1          x2   b   11.0   21.0
# 9    1          x2   c   12.0   22.0
# 10   1          x2   d    0.4    1.4
# 11   1          x2   e    0.5    1.5
# 12   1          x2   f   15.0   25.0
# 13   2          x3   x   16.0   26.0
# 14   2          x3   y    0.6    1.6

答案 3 :(得分:1)

另一个选择是检查DF2中的每一行。

我们重新创建您的表,并将某些列设置为字符:

library(data.table)

# Rebuild the question's two tables, then store id2 as character in both
# (data.frame() may create factors when stringsAsFactors defaults to TRUE,
# i.e. before R 4.0).
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
DF1[["id2"]] <- as.character(DF1[["id2"]])

DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27)
)
DF2[["id2"]] <- as.character(DF2[["id2"]])

然后我们保存DF1的列顺序(稍后会用到这个顺序)

# Remember DF1's column layout so rows taken from DF2 can be arranged the
# same way before they are stacked onto DF1's rows.
column_order <- names(DF1)

现在我们将每个客户代码分开

# The distinct client codes of DF1, as plain character strings.
client_codes <- as.character(unique(DF1[["client_code"]]))

然后我们定义New_Table,它是一个数据框,用来存放最终结果

# Start from an empty data frame; the per-client results are appended to it.
New_Table <- data.frame()

现在,我们创建一个嵌套的for循环,以考虑不同的客户端代码,并检查DF2中的每一行。

# For every client, add the DF2 rows of the client's id1 group whose
# (id1, id2) pair the client does not have yet, tagged with the client code.
#
# Notes on the implementation:
#   * seq_len() makes the row check a no-op when a client's id1 has no rows
#     in DF2; a `1:nrow(...)` loop would iterate over c(1, 0) in that case
#     and fail when tagging the empty row with the client code.
#   * Each client's rows are assembled in one step and everything is bound
#     once at the end, instead of growing data frames with rbind() inside
#     nested loops.
#   * Columns are reordered with plain indexing, so no data.table call is
#     needed in this step.
completed_clients <- lapply(client_codes, function(code) {
  # Rows this client already has in DF1.
  client_rows <- DF1[DF1$client_code == code, ]

  # Candidate rows: everything DF2 holds for the client's id1 value(s).
  candidates <- DF2[DF2$id1 %in% unique(client_rows$id1), ]

  # TRUE for each candidate whose (id1, id2) pair selects no client row.
  is_missing <- vapply(
    seq_len(nrow(candidates)),
    function(j) {
      same_key <- client_rows$id1 == candidates$id1[j] &
        client_rows$id2 == candidates$id2[j]
      nrow(client_rows[same_key, ]) == 0
    },
    logical(1)
  )

  additions <- candidates[is_missing, , drop = FALSE]
  # rep() keeps the assignment valid when there is nothing to add.
  additions$client_code <- rep(code, nrow(additions))

  # Give the additions DF1's column layout, then stack the two pieces.
  rbind(client_rows, additions[, column_order, drop = FALSE])
})

# Append all clients to whatever New_Table already holds (initially empty).
New_Table <- do.call(rbind, c(list(New_Table), completed_clients))