R-合并两个数据帧,但ID的值有分号

时间:2013-05-27 23:11:25

标签: r join merge row collapse

这是以下问题的后续:“R-合并两个数据帧,但某些值中包含分号”。那个问题已经由贡献者 agstudy 解决了,谢谢!

我的实际数据比链接中讨论的情况要复杂一些,我已经被卡住一段时间了。这就是我的数据帧(df2)的样子:

myIDColumn  someName    somevalue       
AB  gsdfg   123     
CD  tfgsdfg 234     
EF  sfdgsf  365     
GH  gdfgb   53453       
IJ  sr  64564       
KL  sfsdv   4234234     
MN  ewrwe   5       
OP  dsfsss  3453        
QR  gggg    667     
ST  dss 7567        
UV  hhhhjf  55      
WX  dfadasad    8657        
YZ  ghfgh   1234        
ABC gdgfg   234455      
VCB hgjkk   5555667     

这就是我的df1的样子:

ID  someText    someThing       
AB  ada 12      
CD;EF;QR    dfsdf   13      
IJ  fgfgd   14      
KL  fgdg    15      
MN  gh  16      
OP;WX   jhjhj   17      
WW  ghjgjhgjghj 18      
YZ  kkl 19

这是我希望得到的输出:

(此处原为一张展示期望输出的图片,图片未能保留。)

我可以使用以下方法合并两个:

# Join df2 to df1 on the id columns, keeping every row of df1 (all.y = TRUE)
# even when its ID has no counterpart in df2.
mm <- merge(
  x = df2, y = df1,
  by.x = "myIDColumn", by.y = "ID",
  all.y = TRUE
)

但之后不知道如何继续进行。

如能提供任何帮助,不胜感激。谢谢。

DF1:

# Literal definition of the question's DF1 (8 rows, 3 columns).
# ID and someText are stored as factors (integer codes plus .Label levels);
# someThing is the integer sequence 12:19. Several ID levels contain multiple
# ids joined by ";" (e.g. "CD;EF;QR").
structure(list(ID = structure(1:8, .Label = c("AB", "CD;EF;QR", 
"IJ", "KL", "MN", "OP;WX", "WW", "YZ"), class = "factor"), someText = structure(c(1L, 
2L, 4L, 3L, 5L, 7L, 6L, 8L), .Label = c("ada", "dfsdf", "fgdg", 
"fgfgd", "gh", "ghjgjhgjghj", "jhjhj", "kkl"), class = "factor"), 
    someThing = 12:19), .Names = c("ID", "someText", "someThing"
), class = "data.frame", row.names = c(NA, -8L))

DF2:

# Literal definition of the question's DF2 (15 rows, 3 columns).
# myIDColumn and someName are stored as factors (integer codes plus .Label
# levels); somevalue is an integer vector. Each myIDColumn value is a single
# id with no ";" separator.
structure(list(myIDColumn = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L, 14L, 15L, 2L, 13L), .Label = c("AB", "ABC", 
"CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "VCB", 
"WX", "YZ"), class = "factor"), someName = structure(c(9L, 15L, 
12L, 5L, 14L, 13L, 4L, 2L, 7L, 3L, 11L, 1L, 8L, 6L, 10L), .Label = c("dfadasad", 
"dsfsss", "dss", "ewrwe", "gdfgb", "gdgfg", "gggg", "ghfgh", 
"gsdfg", "hgjkk", "hhhhjf", "sfdgsf", "sfsdv", "sr", "tfgsdfg"
), class = "factor"), somevalue = c(123L, 234L, 365L, 53453L, 
64564L, 4234234L, 5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L, 
234455L, 5555667L)), .Names = c("myIDColumn", "someName", "somevalue"
), class = "data.frame", row.names = c(NA, -15L))

1 个答案:

答案 0 :(得分:2)

可能有更好的方法,但你可以创建一个临时数据帧:

# df1: 8 rows; ID and someText are plain character columns (not factors).
# An ID may hold several ids joined by ";".
df1 <- data.frame(
  ID = c("AB", "CD;EF;QR", "IJ", "KL", "MN", "OP;WX", "WW", "YZ"),
  someText = c(
    "ada", "dfsdf", "fgfgd", "fgdg", "gh", "jhjhj", "ghjgjhgjghj", "kkl"
  ),
  someThing = 12:19,
  stringsAsFactors = FALSE
)


# df2: 15 rows keyed by a single id per row; myIDColumn and someName are plain
# character columns (not factors), somevalue is integer.
df2 <- data.frame(
  myIDColumn = c(
    "AB", "CD", "EF", "GH", "IJ", "KL", "MN", "OP",
    "QR", "ST", "UV", "WX", "YZ", "ABC", "VCB"
  ),
  someName = c(
    "gsdfg", "tfgsdfg", "sfdgsf", "gdfgb", "sr", "sfsdv", "ewrwe", "dsfsss",
    "gggg", "dss", "hhhhjf", "dfadasad", "ghfgh", "gdgfg", "hgjkk"
  ),
  somevalue = c(
    123L, 234L, 365L, 53453L, 64564L, 4234234L, 5L, 3453L,
    667L, 7567L, 55L, 8657L, 1234L, 234455L, 5555667L
  ),
  stringsAsFactors = FALSE
)
# Expand the semicolon-separated ids in `x$ID` into one row per individual id.
#
# Args:
#   x: data frame with columns ID, someText and someThing. ID may hold several
#      ids joined by ";" and may be a character vector or a factor.
#
# Returns:
#   A data frame with columns ID, someText, someThing (the source row's values,
#   repeated once per individual id) and ID1 (that single id). A row whose ID
#   is the empty string is kept with ID1 = NA.
f <- function(x) {
    # strsplit() only accepts character input, so coerce factor IDs first.
    parts <- strsplit(as.character(x$ID), ";", fixed = TRUE)
    # An empty ID splits into zero pieces; keep such a row with a missing ID1
    # instead of failing on mismatched column lengths.
    parts[vapply(parts, length, integer(1)) == 0L] <- list(NA_character_)
    # Repeat each source row once per piece so that every column stays aligned
    # with ID1 even when `x` holds more than one row (relying on data.frame()
    # recycling would pair rows and pieces incorrectly in that case).
    idx <- rep(seq_len(nrow(x)), vapply(parts, length, integer(1)))
    data.frame(ID = x$ID[idx], someText = x$someText[idx],
               someThing = x$someThing[idx],
               ID1 = as.character(unlist(parts)))
}
library(plyr)
# Apply f to each ID group of df1 and stack the pieces: one row per single id.
df3 <- ddply(.data = df1, .variables = .(ID), .fun = f)

> df3
         ID    someText someThing ID1
1        AB         ada        12  AB
2  CD;EF;QR       dfsdf        13  CD
3  CD;EF;QR       dfsdf        13  EF
4  CD;EF;QR       dfsdf        13  QR
5        IJ       fgfgd        14  IJ
6        KL        fgdg        15  KL
7        MN          gh        16  MN
8     OP;WX       jhjhj        17  OP
9     OP;WX       jhjhj        17  WX
10       WW ghjgjhgjghj        18  WW
11       YZ         kkl        19  YZ

然后可以将它与数据帧 df2 合并,并汇总数据:

# Join each individual id (ID1) to its df2 row, keeping every df3 row even
# when df2 has no match (all.y = TRUE).
mm <- merge(
  x = df2, y = df3,
  by.x = "myIDColumn", by.y = "ID1",
  all.y = TRUE
)
# Collapse back to one row per original df1 row, joining the matched values
# with commas. paste() turns a missing value into the text "NA".
ddply(
  mm, .(ID, someText, someThing), summarize,
  somevalue = paste(somevalue, collapse = ","),
  someName = paste(someName, collapse = ",")
)

        ID    someText someThing   somevalue            someName
1       AB         ada        12         123               gsdfg
2 CD;EF;QR       dfsdf        13 234,365,667 tfgsdfg,sfdgsf,gggg
3       IJ       fgfgd        14       64564                  sr
4       KL        fgdg        15     4234234               sfsdv
5       MN          gh        16           5               ewrwe
6    OP;WX       jhjhj        17   3453,8657     dsfsss,dfadasad
7       WW ghjgjhgjghj        18          NA                  NA
8       YZ         kkl        19        1234               ghfgh