我有一个R数据帧,如下所示-
# Example input: one row per probe. Gene.Symbol and Gene.ID each hold several
# values joined by the "///" delimiter.
# stringsAsFactors = FALSE keeps the text columns as character vectors on
# R < 4.0.0 as well; there data.frame() would otherwise turn them into
# factors, and strsplit() on a factor fails with "non-character argument".
df <- data.frame(
  FDR = c(0.009, 0.007, 0.007),
  Probe_ID = c("1555272_at", "1557203_at", "1557384_at"),
  Gene.Symbol = c(
    "RSPH10B2///RSPH10B",
    "PABPC1L2B///PABPC1L2A",
    "LOC100506639///ZNF131"
  ),
  Gene.ID = c("728194///222967", "645974///340529", "100506639///7690"),
  stringsAsFactors = FALSE
)
df
FDR Probe_ID Gene.Symbol Gene.ID
1 0.009 1555272_at RSPH10B2///RSPH10B 728194///222967
2 0.007 1557203_at PABPC1L2B///PABPC1L2A 645974///340529
3 0.007 1557384_at LOC100506639///ZNF131 100506639///7690
我想根据列 df$Gene.Symbol 的行值中的分隔符 /// 来拆分数据帧。结果数据框应类似于-
FDR Probe_ID Gene.symbol Gene.ID
0.009 15111_at RSPH10B2 728194
0.009 15111_at RSPH10B 222967
0.007 15222_at PABPC1L2B 645974
0.007 15222_at PABPC1L2A 340529
0.007 15333_at LOC100506639 100506639
0.007 15333_at ZNF131 7690
我尝试了以下代码,但没有用,并产生了具有重复元素的列-
# Attempt from the question; the asker reports that it does not work.
# NOTE(review): the column created in the data.frame() call above is named
# Gene.Symbol (capital S); `Gene.symbol` here differs in case.
# gsub() removes every "///" before strsplit() runs, and the split is then
# done on the literal ", " (fixed = TRUE), which does not occur in the sample
# values shown above.
s <- strsplit(gsub("///","",df$Gene.symbol),", ",fixed = TRUE)
# Repeat each source value once per piece obtained from it and pair it with
# the flattened pieces.
res <- data.frame(Id = rep(df$Gene.symbol, lengths(s)), result = unlist(s))
# NOTE(review): `annotated` is not defined anywhere in the visible snippet.
ans <- merge(annotated,res)
谢谢!
答案 0 :(得分:2)
使用tidyr的separate_rows(配合dplyr提供的管道%>%)的解决方案:
# separate_rows() is exported by tidyr, not dplyr, so tidyr must be attached.
# dplyr is kept because it provides the %>% pipe used below.
library(dplyr)
library(tidyr)

# Split Gene.Symbol and Gene.ID on "///" in parallel: each input row becomes
# one output row per delimited piece, with the other columns repeated.
df %>%
  separate_rows(Gene.Symbol, Gene.ID, sep = "///")
# A tibble: 6 x 4
FDR Probe_ID Gene.Symbol Gene.ID
<dbl> <chr> <chr> <chr>
1 0.009 1555272_at RSPH10B2 728194
2 0.009 1555272_at RSPH10B 222967
3 0.007 1557203_at PABPC1L2B 645974
4 0.007 1557203_at PABPC1L2A 340529
5 0.007 1557384_at LOC100506639 100506639
6 0.007 1557384_at ZNF131 7690
答案 1 :(得分:0)
将strsplit
与by
一起使用。
# Process the data frame one Probe_ID group at a time with by(), then stack
# the per-group results with rbind().
res <- do.call(
  rbind,
  by(df, df$Probe_ID, function(probe_rows) {
    # First two columns are carried over unchanged; row names are dropped.
    id_cols <- probe_rows[, 1:2]
    rownames(id_cols) <- NULL
    # Remaining columns are split on the "///" delimiter.
    split_cols <- sapply(probe_rows[, -(1:2)], strsplit, "///")
    cbind(id_cols, split_cols)
  })
)
res
# FDR Probe_ID Gene.Symbol Gene.ID
# 1555272_at.1 0.009 1555272_at RSPH10B2 728194
# 1555272_at.2 0.009 1555272_at RSPH10B 222967
# 1557203_at.1 0.007 1557203_at PABPC1L2B 645974
# 1557203_at.2 0.007 1557203_at PABPC1L2A 340529
# 1557384_at.1 0.007 1557384_at LOC100506639 100506639
# 1557384_at.2 0.007 1557384_at ZNF131 7690
注意:如果得到 Error in FUN(X[[i]], ...) : non-character argument,则说明这些变量可能是因子(factors)。您可以先执行 df[2:4] <- lapply(df[2:4], as.character) 把它们转换为字符型,并尽快更新R版本(自R 4.0.0起,data.frame()默认不再把字符串转换为因子)。
数据:
# Sample data used by the answer: three probes whose Gene.Symbol and Gene.ID
# values are joined by "///". Text columns are kept as character vectors.
dat <- data.frame(
  FDR = c(0.009, 0.007, 0.007),
  Probe_ID = c("1555272_at", "1557203_at", "1557384_at"),
  Gene.Symbol = c(
    "RSPH10B2///RSPH10B",
    "PABPC1L2B///PABPC1L2A",
    "LOC100506639///ZNF131"
  ),
  Gene.ID = c("728194///222967", "645974///340529", "100506639///7690"),
  stringsAsFactors = FALSE
)