使用混合数据连接多个表

时间:2016-02-06 15:25:38

标签: r dplyr sqldf

我的数据:

# Table whose `from` and `to` columns hold numeric ids, plus a value per row.
data1 <- data.frame(
  from   = c(1, 2, 13, 4),
  to     = c(4, 3, 9, 1),
  values = c(12, 56, 67, 78)
)

# Lookup table: one place name per numeric id.
data2 <- data.frame(
  place = c("NY", "London", "Brest", "Nantes"),
  id    = c(1, 2, 3, 4)
)

我的结果:

from to values
   1  4     12
   2  3     56
  13  9     67
   4  1     78

  place id
     NY  1
 London  2
  Brest  3
 Nantes  4

我希望使用 dplyr 包中的 join 函数得到如下结果(放在一张新表中):

from      to      values
NY     Nantes     12
London  Brest     56
Nantes     NY     78

我尝试了什么:

 # Attempt from the question: join data1 to data2 on both id columns at once.
 # NOTE(review): this `by` asks for rows where `from` and `to` match the same
 # `id` simultaneously, rather than looking each column up separately —
 # presumably why it does not produce the expected table.
 data3<- inner_join (data1, data2, by =c("from" = "id", "to" = "id"))
 data3

一些参考文献:
https://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html

混合数据的更好示例

假设我有 50 列数据,其中既有地理数据(“地点”列),也有非地理数据(levels 列、value 列)。我不想改变数据框(data.frame)中列的顺序,并且想保留原有的列名。

   # Mixed table: the *place columns hold numeric place ids (see data2$id);
   # levels1 and value1..value3 are other data interleaved between them.
   data1 <- data.frame(
     levels1     = c("name1", "name2", "name3", "name4"),
     value1      = c(4, 3, 9, 1),
     firstplace  = c(1, 2, 13, 4),
     secondplace = c(1, 2, 2, 4),
     value2      = c(78, 3000, 90, 101),
     thirdplace  = c(1, 1, 2, 4),
     fourthplace = c(4, 4, 4, 4),
     fifthplace  = c(1, 2, 3, 4),
     value3      = c(12, 56, 67, 78)
   )

   # Lookup table: one place name per numeric id.
   data2 <- data.frame(
     place = c("NY", "London", "Brest", "Nantes"),
     id    = c(1, 2, 3, 4)
   )

具有不同名称的示例(更复杂?)

我不想改变数据框(data.frame)中列的顺序,并且想保留原有的列名。

   # Same layout as before, but the columns holding place ids no longer share
   # a common name pattern (shops, after_sales_service, provider, seller, maker).
   data1 <- data.frame(
     levels1             = c("name1", "name2", "name3", "name4"),
     value1              = c(4, 3, 9, 1),
     shops               = c(1, 2, 13, 4),
     after_sales_service = c(1, 2, 2, 4),
     value2              = c(78, 3000, 90, 101),
     provider            = c(1, 1, 2, 4),
     seller              = c(4, 4, 4, 4),
     maker               = c(1, 2, 3, 4),
     value3              = c(12, 56, 67, 78)
   )

   # Lookup table: one place name per numeric id.
   data2 <- data.frame(
     place = c("NY", "London", "Brest", "Nantes"),
     id    = c(1, 2, 3, 4)
   )

2 个答案:

答案 0 :(得分:4)

您可以把 data2 当作查找表来使用,而不必进行连接(join):

library(dplyr)
# Translate a vector of ids into place names using data2 as a lookup table;
# ids that are absent from data2$id become NA.
place_of <- function(ids) {
  data2$place[match(ids, data2$id)]
}

# Replace both id columns by place names, then drop every row that
# contains an NA (i.e. an id that could not be translated).
data1 <- data1 %>%
  mutate(
    from = place_of(from),
    to = place_of(to)
  ) %>%
  filter(complete.cases(.))

给出:

> data1
    from     to values
1     NY Nantes     12
2 London  Brest     56
3 Nantes     NY     78

使用 data.table 包的替代解决方案:

library(data.table)
# Convert data1 to a data.table in place and overwrite `from` and `to` by
# reference with the matching place names (NA where the id is not in data2).
id_cols <- c("from", "to")
setDT(data1)[
  ,
  (id_cols) := lapply(.SD, function(x) data2$place[match(x, data2$id)]),
  .SDcols = id_cols
]
# Show the result without the rows that contain an NA.
na.omit(data1)

你也可以做一个双left_join

# Join data2 twice: the first join looks up the place for `from`, the second
# the place for `to`. Because both joins add a `place` column, dplyr names
# them place.x (from) and place.y (to).
data1 %>%
  left_join(data2, by = c("from" = "id")) %>%
  left_join(data2, by = c("to" = "id")) %>%
  # Overwrite the id columns with the place names and select by name, so the
  # result has the requested layout (from, to, values) instead of
  # values, place.x, place.y obtained by dropping columns by position.
  mutate(from = place.x, to = place.y) %>%
  select(from, to, values) %>%
  # Drop rows whose id was not found in data2.
  filter(complete.cases(.))

更新1:如果有多个列都需要匹配地名,最好先将数据框转换为长格式。下面是针对更大数据集的示例:

library(dplyr)
library(tidyr)
# Long format: one row per (values, column name, id) triple.
data1 %>%
  gather(var, val, -values) %>%
  # Attach the place name belonging to each id.
  left_join(data2, by = c("val" = "id")) %>%
  # Ids without a place are dropped here; spread() fills those cells with NA.
  filter(!is.na(place)) %>%
  # The raw id (third column) is no longer needed.
  select(-val) %>%
  # Back to wide format: one column of place names per original column.
  spread(var, place)

给出:

  values fifthplace firstplace fourthplace   from secondplace thirdplace     to
1     12         NY         NY      Nantes     NY          NY         NY Nantes
2     56     London     London      Nantes London      London         NY  Brest
3     67      Brest       <NA>      Nantes   <NA>      London     London   <NA>
4     78     Nantes     Nantes      Nantes Nantes      Nantes     Nantes     NY

使用 data.table ,您可以:

library(data.table)
# Long format: every column except `values` is stacked into variable/value.
long <- melt(setDT(data1), id.vars = "values")
# Join on value == id; this adds the `place` column from data2.
matched <- long[data2, on = c(value = "id")]
# Back to wide format with place names in the cells.
dcast(matched, values ~ variable, value.var = "place")

给你相同的结果。

更新2:为了回应问题的第二次更新,您可以使用 dplyr / tidyr 按照以下方式进行操作:

# Only the five *place columns are stacked into long format; all other
# columns stay as identifying columns.
data1 %>%
  gather(
    var, val,
    firstplace, secondplace, thirdplace, fourthplace, fifthplace
  ) %>%
  # Attach the place name belonging to each id, then discard the raw id.
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  # Back to wide format: one column of place names per original place column.
  spread(var, place)

给出:

  levels1 value1 value2 value3 fifthplace firstplace fourthplace secondplace thirdplace
1   name1      4     78     12         NY         NY      Nantes          NY         NY
2   name2      3   3000     56     London     London      Nantes      London         NY
3   name3      9     90     67      Brest       <NA>      Nantes      London     London
4   name4      1    101     78     Nantes     Nantes      Nantes      Nantes     Nantes

或使用 data.table

# Columns that hold place ids and therefore need translating.
mvars <- c(
  "firstplace", "secondplace", "thirdplace", "fourthplace", "fifthplace"
)
# Stack only those columns, join on value == id to pick up `place`,
# then reshape back to one column per original place column.
long <- melt(setDT(data1), measure.vars = mvars)
matched <- long[data2, on = c(value = "id")]
dcast(
  matched,
  levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)

给出相同的结果:

   levels1 value1 value2 value3 firstplace secondplace thirdplace fourthplace fifthplace
1:   name1      4     78     12         NY          NY         NY      Nantes         NY
2:   name2      3   3000     56     London      London         NY      Nantes     London
3:   name3      9     90     67         NA      London     London      Nantes      Brest
4:   name4      1    101     78     Nantes      Nantes     Nantes      Nantes     Nantes

更新3:如果您想使用索引号,可以执行以下操作:

# dplyr / tidyr
# Columns 3, 4, 6, 7 and 8 are selected by position instead of by name.
data1 %>%
  gather(var, val, c(3, 4, 6:8)) %>%
  left_join(data2, by = c("val" = "id")) %>%
  select(-val) %>%
  spread(var, place)

# data.table
# Same idea: stack the columns given by position, join on value == id,
# then reshape back to wide format.
long <- melt(setDT(data1), measure.vars = c(3, 4, 6:8))
matched <- long[data2, on = c(value = "id")]
dcast(
  matched,
  levels1 + value1 + value2 + value3 ~ variable,
  value.var = "place"
)

给出( data.table 输出):

   levels1 value1 value2 value3  shops after_sales_service provider seller  maker
1:   name1      4     78     12     NY                  NY       NY Nantes     NY
2:   name2      3   3000     56 London              London       NY Nantes London
3:   name3      9     90     67     NA              London   London Nantes  Brest
4:   name4      1    101     78 Nantes              Nantes   Nantes Nantes Nantes

答案 1 :(得分:2)

我们可以使用 base R 执行此操作:
# The first merge attaches the place for `from`, the second the place for
# `to`. merge() puts the key column first, so after both merges the first
# two columns are the raw `to` and `from` ids, which are dropped.
with_from <- merge(data1, data2, by.x = "from", by.y = "id")
merge(with_from, data2, by.x = "to", by.y = "id")[-(1:2)]

更新

对于您的新数据集,我们可以先把去掉“values”列后的数据集转换为 matrix,再使用 match:

# Every column except the last one is treated as a column of ids.
id_part <- data1[-ncol(data1)]
# match() works on the flattened matrix of ids; restoring the dimensions
# afterwards puts each place name back in its original cell.
place_names <- as.character(
  data2$place[match(as.matrix(id_part), data2$id)]
)
dim(place_names) <- dim(id_part)
temp <- as.data.frame(place_names)
names(temp) <- head(names(data1), -1)
# Last column of data1 first, followed by the translated columns.
cbind(data1[ncol(data1)], temp)
#  values   from     to firstplace secondplace thirdplace fourthplace fifthplace
#1     12     NY Nantes         NY          NY         NY      Nantes         NY
#2     56 London  Brest     London      London         NY      Nantes     London
#3     67   <NA>   <NA>       <NA>      London     London      Nantes      Brest
#4     78 Nantes     NY     Nantes      Nantes     Nantes      Nantes     Nantes

UPDATE2

基于OP帖子中的新更新

# Positions of the columns holding place ids: every column whose name
# contains "place".
i1 <- grep("place", names(data1))
# With no match, data1[-i1] would silently select zero columns, so fail early.
stopifnot(length(i1) > 0)
# Look all ids up at once: match() works on the flattened matrix, and
# restoring the dimensions puts each place name back in its original cell.
place_names <- as.character(
  data2$place[match(as.matrix(data1[i1]), data2$id)]
)
dim(place_names) <- dim(data1[i1])
d1 <- as.data.frame(place_names, stringsAsFactors = FALSE)
# Keep the untouched columns and append the translated ones, renamed
# place1, place2, ... (seq_len() is safe where 1:ncol() is not).
d2 <- cbind(data1[-i1], setNames(d1, paste0("place", seq_len(ncol(d1)))))
d2
#   levels1 value1 value2 value3 place1 place2 place3 place4 place5
#1   name1      4     78     12     NY     NY     NY Nantes     NY
#2   name2      3   3000     56 London London     NY Nantes London
#3   name3      9     90     67   <NA> London London Nantes  Brest
#4   name4      1    101     78 Nantes Nantes Nantes Nantes Nantes

UPDATE3

如果列名不同,只需修改第 2 步(生成 d2 的那一步),让转换后的列沿用原来的列名:

 # Same as the previous step, but the translated columns keep their original
 # names (names(data1[i1])) instead of being renamed place1, place2, ...
 # NOTE(review): i1 was built with grep('place', ...); columns such as `shops`
 # or `maker` would not be matched, so i1 presumably has to be given
 # explicitly (e.g. by position) for that data set — confirm.
 d2 <- cbind(data1[-i1], setNames(d1, names(data1[i1])))