如果在另一列中存在重复项,则如何合并一列中的行

时间:2015-11-30 18:07:40

标签: r

我的数据框如下:

# Example input (dput output): 8 rows that all share the same MergeCol value
# "1:4864876:5864876". Only hgnc_symbol varies: three rows carry a gene symbol
# and the remaining five hold an empty string.
structure(list(ZSSLX.10456.FastSeqA.BiopsyTumour_LCM_PS14_1105_1F_100PCent.gz = c(40.9612938016794, 
40.9612938016794, 40.9612938016794, 40.9612938016794, 40.9612938016794, 
40.9612938016794, 40.9612938016794, 40.9612938016794), Stop = c(5864876, 
5864876, 5864876, 5864876, 5864876, 5864876, 5864876, 5864876
), MergeCol = c("1:4864876:5864876", "1:4864876:5864876", "1:4864876:5864876", 
"1:4864876:5864876", "1:4864876:5864876", "1:4864876:5864876", 
"1:4864876:5864876", "1:4864876:5864876"), hgnc_symbol = c("MIR4417", 
"MIR4689", "", "", "NPHP4", "", "", ""), chromosome_name = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("ZSSLX.10456.FastSeqA.BiopsyTumour_LCM_PS14_1105_1F_100PCent.gz", 
"Stop", "MergeCol", "hgnc_symbol", "chromosome_name"), row.names = c(130L, 
289L, 432L, 446L, 1681L, 3467L, 3468L, 3469L), class = "data.frame")

如果MergeCol中有重复项,我想把对应的hgnc_symbol值合并到一起。在这个例子中,我希望最终得到:

# Expected result (dput output): a single row for the duplicated MergeCol
# value, with the non-empty hgnc_symbol values joined by commas.
structure(list(ZSSLX.10456.FastSeqA.BiopsyTumour_LCM_PS14_1105_1F_100PCent.gz = 40.9612938016794, Stop = 5864876, MergeCol = "1:4864876:5864876", hgnc_symbol = "MIR4417,MIR4689,NPHP4", chromosome_name = 1L), .Names = c("ZSSLX.10456.FastSeqA.BiopsyTumour_LCM_PS14_1105_1F_100PCent.gz", "Stop", "MergeCol", "hgnc_symbol", "chromosome_name"), row.names = 130L, class = "data.frame")

是否有一种方法可以根据另一列重复来折叠一列中的行?

1 个答案:

答案 0 :(得分:1)

看起来您是想让数据框完成通常由列表来承担的工作;不过,如果您可以接受把向量合并成一个字符串,则可以这样做:

library(plyr)

# Collapse rows that agree on every column except hgnc_symbol into one row.
# The grouping columns are chosen by name (not by position) so the call keeps
# working if the column order of df changes.
ddply(df, setdiff(names(df), "hgnc_symbol"), summarize,
      # Drop missing and empty symbols, de-duplicate, then join with commas.
      # The explicit is.na() guard keeps NA entries from being pasted as "NA".
      hgnc_symbol = paste(
        unique(hgnc_symbol[!is.na(hgnc_symbol) & hgnc_symbol != ""]),
        collapse = ","
      ))

#   ZSSLX.10456.FastSeqA.BiopsyTumour_LCM_PS14_1105_1F_100PCent.gz    Stop          MergeCol  chromosome_name           hgnc_symbol
# 1                                                       40.96129 5864876 1:4864876:5864876                1 MIR4417,MIR4689,NPHP4

请注意,这假设每次MergeCol重复时除了hgnc_symbol之外的所有其他内容都会重复。您可以将相同的概念应用于其他不一定重复的列。