如何将数据从一个数据框添加到另一个数据框

时间:2018-01-09 02:43:07

标签: r

我有两个数据集

# First example: df1 is a one-column data frame with 12 rows (a factor of IDs).
# The list element is written as `V1`, but `.Names = "from"` sets the names
# attribute, so the resulting column is called `from`.
df1 <- structure(list(V1 = structure(1:12, .Label = c("A0A0A6YXQ7", 
"A0A0A6YXS5", "A0A0A6YXW8", "A0A0A6YXX6", "A0A0A6YXZ1", "A0A0A6YY28", 
"A0A0A6YY43", "A0A0A6YY47", "A0A0A6YY78", "A0A0A6YY89", "A0A0A6YY91", 
"A0A0A7NQN9"), class = "factor")), .Names = "from", class = "data.frame", row.names = c(NA, 
-12L))

# First example: df2 is the lookup table with 8 rows and two factor columns,
# `from` (ID) and `to` (label). It contains only 8 of the 12 IDs found in df1.
df2 <- structure(list(from = structure(1:8, .Label = c("A0A0A6YXQ7", 
"A0A0A6YXW8", "A0A0A6YXX6", "A0A0A6YXZ1", "A0A0A6YY28", "A0A0A6YY47", 
"A0A0A6YY78", "A0A0A6YY91"), class = "factor"), to = structure(c(4L, 
5L, 1L, 2L, 6L, 3L, 3L, 3L), .Label = c("Arhgap15", "Igtp", "MumuTL", 
"Myo1f", "Pak2", "pol"), class = "factor")), .Names = c("from", 
"to"), class = "data.frame", row.names = c(NA, -8L))

df1和df2都有一个名为from的列。df1中的所有字符串本应都出现在df2中;如果某个字符串不在df2中,我希望它仍然按照df1中的原始顺序显示,并且它在to列中的值为NA。

例如,在df2中缺少以下字符串

A0A0A6YXS5、A0A0A6YY43、A0A0A6YY89和A0A0A7NQN9

所以输出看起来像这样

From       To
A0A0A6YXQ7  Myo1f
A0A0A6YXS5  NA
A0A0A6YXW8  Pak2
A0A0A6YXX6  Arhgap15
A0A0A6YXZ1  Igtp
A0A0A6YY28  pol
A0A0A6YY43  NA
A0A0A6YY47  MumuTL
A0A0A6YY78  MumuTL
A0A0A6YY89  NA
A0A0A6YY91  MumuTL
A0A0A7NQN9  NA

一个不太好的解决方案是合并这两个数据框

# Left join on `from` that keeps every row of df1 (all.x = TRUE).
# merge() sorts the result on the `by` column by default (sort = TRUE),
# so the original row order of df1 is not guaranteed to be preserved.
merge(df1, df2, by = "from", all.x = TRUE)

这种方法适用于某些数据,但不适用于其他数据。

让我们看看更大的数据

# Larger example: df1 has 21 rows in a single factor column `from`.
# The integer codes start at 10, so the A0A0A6.../A0A0A7... IDs come first and
# the A0A023.../A0A067... IDs come last: the row order is NOT the sorted
# order of the IDs, which is what exposes the reordering done by merge().
df1<- structure(list(from = structure(c(10L, 11L, 12L, 13L, 14L, 15L, 
16L, 17L, 18L, 19L, 20L, 21L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L), .Label = c("A0A023J6K5", "A0A023J6L7", "A0A023J6M1", "A0A067XG53", 
"A0A067XKM5", "A0A067XKP8", "A0A067XKR4", "A0A067XKW4", "A0A067XKW7", 
"A0A0A6YXQ7", "A0A0A6YXS5", "A0A0A6YXW8", "A0A0A6YXX6", "A0A0A6YXZ1", 
"A0A0A6YY28", "A0A0A6YY43", "A0A0A6YY47", "A0A0A6YY78", "A0A0A6YY89", 
"A0A0A6YY91", "A0A0A7NQN9"), class = "factor")), .Names = "from", class = "data.frame", row.names = c(NA, 
-21L))

# Larger example: df2 is the lookup table with 17 rows (`from` -> `to`).
# Its `from` factor has a different level set than df1$from: it includes
# "A0A023T778", which does not occur in df1, and lacks several df1 IDs.
df2 <- structure(list(from = structure(c(10L, 11L, 12L, 13L, 14L, 15L, 
16L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), .Label = c("A0A023J6K5", 
"A0A023J6L7", "A0A023J6M1", "A0A023T778", "A0A067XG53", "A0A067XKM5", 
"A0A067XKP8", "A0A067XKR4", "A0A067XKW4", "A0A0A6YXQ7", "A0A0A6YXW8", 
"A0A0A6YXX6", "A0A0A6YXZ1", "A0A0A6YY28", "A0A0A6YY47", "A0A0A6YY78", 
"A0A0A6YY91"), class = "factor"), to = structure(c(7L, 10L, 1L, 
4L, 11L, 6L, 6L, 6L, 9L, 8L, 9L, 5L, 2L, 8L, 3L, 9L, 8L), .Label = c("Arhgap15", 
"Cask", "COXI", "Igtp", "Magohb", "MumuTL", "Myo1f", "ND1", "ND4", 
"Pak2", "pol"), class = "factor")), .Names = c("from", "to"), class = "data.frame", row.names = c(NA, 
-17L))

然后我需要获得以下输出

from    to
A0A0A6YXQ7  Myo1f
A0A0A6YXS5  
A0A0A6YXW8  Pak2
A0A0A6YXX6  Arhgap15
A0A0A6YXZ1  Igtp
A0A0A6YY28  pol
A0A0A6YY43  
A0A0A6YY47  MumuTL
A0A0A6YY78  MumuTL
A0A0A6YY89  
A0A0A6YY91  MumuTL
A0A0A7NQN9  
A0A023J6K5  ND4
A0A023J6L7  ND1
A0A023J6M1  ND4
A0A067XG53  Cask
A0A067XKM5  ND1
A0A067XKP8  COXI
A0A067XKR4  ND4
A0A067XKW4  ND1
A0A067XKW7  

但如果我这样做

#> merge(df1, df2, by = "from", all.x = TRUE)
#         from       to
#1  A0A023J6K5      ND4
#2  A0A023J6L7      ND1
#3  A0A023J6M1      ND4
#4  A0A067XG53     Cask
#5  A0A067XKM5      ND1
#6  A0A067XKP8     COXI
#7  A0A067XKR4      ND4
#8  A0A067XKW4      ND1
#9  A0A067XKW7     <NA>
#10 A0A0A6YXQ7    Myo1f
#11 A0A0A6YXS5     <NA>
#12 A0A0A6YXW8     Pak2
#13 A0A0A6YXX6 Arhgap15
#14 A0A0A6YXZ1     Igtp
#15 A0A0A6YY28      pol
#16 A0A0A6YY43     <NA>
#17 A0A0A6YY47   MumuTL
#18 A0A0A6YY78   MumuTL
#19 A0A0A6YY89     <NA>
#20 A0A0A6YY91   MumuTL
#21 A0A0A7NQN9     <NA>

我基本上希望保持df1的行顺序不变,只需把df2中from相匹配的行的to值填入对应位置。

1 个答案:

答案 0 :(得分:0)

与merge不同,dplyr::left_join保留了行顺序。使用更大的示例数据:

# No `by` is supplied, so dplyr joins on the shared column `from` and reports
# this in the "Joining, by" message below. Rows come back in df1's order,
# with NA in `to` for IDs that have no match in df2.
dplyr::left_join(df1, df2)
# Joining, by = "from"
#          from       to
# 1  A0A0A6YXQ7    Myo1f
# 2  A0A0A6YXS5     <NA>
# 3  A0A0A6YXW8     Pak2
# 4  A0A0A6YXX6 Arhgap15
# 5  A0A0A6YXZ1     Igtp
# 6  A0A0A6YY28      pol
# 7  A0A0A6YY43     <NA>
# 8  A0A0A6YY47   MumuTL
# 9  A0A0A6YY78   MumuTL
# 10 A0A0A6YY89     <NA>
# 11 A0A0A6YY91   MumuTL
# 12 A0A0A7NQN9     <NA>
# 13 A0A023J6K5      ND4
# 14 A0A023J6L7      ND1
# 15 A0A023J6M1      ND4
# 16 A0A067XG53     Cask
# 17 A0A067XKM5      ND1
# 18 A0A067XKP8     COXI
# 19 A0A067XKR4      ND4
# 20 A0A067XKW4      ND1
# 21 A0A067XKW7     <NA>
# Warning message:
# Column `from` joining factors with different levels, coercing to character vector