df1 <-
Gene GeneLocus
CPA1|1357 chr7:130020290-130027948:+
GUCY2D|3000 chr17:7905988-7923658:+
UBC|7316 chr12:125396194-125399577:-
C11orf95|65998 chr11:63527365-63536113:-
ANKMY2|57037 chr7:16639413-16685398:-
预期产出
df2 <-
Gene.1 Gene.2 chr start end
CPA1 1357 7 130020290 130027948
GUCY2D 3000 17 7905988 7923658
UBC 7316 12 125396194 125399577
C11orf95 65998 11 63527365 63536113
ANKMY2 57037 7 16639413 16685398
我试过这种方式..
# Split the "Gene" and "GeneLocus" columns of df1 into their components using
# splitstackshape::cSplit(), then strip the "chr" prefix from the chromosome.

# Install splitstackshape only when it is missing, so that re-running the
# script does not reinstall the package every time.
if (!requireNamespace("splitstackshape", quietly = TRUE)) {
  install.packages("splitstackshape")
}
library(splitstackshape)

# Split Gene on the literal "|".
df1 <- cSplit(df1, "Gene", sep = "|", direction = "wide", fixed = TRUE)
# Split GeneLocus on ":"; the pieces are referenced below as GeneLocus_1,
# GeneLocus_2, ...
df1 <- cSplit(df1, "GeneLocus", sep = ":", direction = "wide", fixed = TRUE)
# Split the second GeneLocus piece on "-".
df1 <- cSplit(df1, "GeneLocus_2", sep = "-", direction = "wide", fixed = TRUE)
df1 <- data.frame(df1)

# df2 was never created before being assigned into, which fails with
# "object 'df2' not found". Start it as a copy of the split table.
df2 <- df1
# Anchor the pattern so only a leading "chr" is removed.
df2$GeneLocus_1 <- sub("^chr", "", df1$GeneLocus_1)
我想知道是否有其他替代方法以更简单的方式完成
答案 0 :(得分:2)
给你……如果出现警告,忽略即可,它们不影响输出;这种做法实际上还有一个副作用:链信息(`:+` 或 `:-`)会被丢弃。
library(tidyr)
library(dplyr)

# Split Gene into Gene.1 / Gene.2 and GeneLocus into chr / start / end, then
# strip the "chr" prefix. The strand piece of GeneLocus is discarded.
df1 %>%
  # Split on the literal "|" only. The default separator splits on any
  # non-alphanumeric run, which would break values containing "-" or ".".
  separate(Gene, c("Gene.1", "Gene.2"), sep = "\\|") %>%
  # Split on ":" and "-" only. extra = "drop" discards the trailing strand
  # piece explicitly instead of emitting a warning for every row.
  separate(
    GeneLocus,
    c("chr", "start", "end"),
    sep = "[:-]",
    extra = "drop"
  ) %>%
  # Anchor the pattern so only a leading "chr" is removed.
  mutate(chr = sub("^chr", "", chr))
输出:
Gene.1 Gene.2 chr start end
1 CPA1 1357 7 130020290 130027948
2 GUCY2D 3000 17 7905988 7923658
3 UBC 7316 12 125396194 125399577
4 C11orf95 65998 11 63527365 63536113
5 ANKMY2 57037 7 16639413 16685398
答案 1 :(得分:1)
我建议采用以下方法:
`cSplit` 会根据检测到的输出列数来“平衡”被拆分的各列。因此,由于第一列拆分后只会产生 2 列,而第二列会产生 4 列,您需要从结果中删除第 3 列和第 4 列。
library(splitstackshape)
# as.data.table() and `:=` come from data.table; attach it explicitly rather
# than relying on another package to have done so.
library(data.table)

# Captures chromosome, start, end and strand from
# "chr<name>:<start>-<end>:<strand>". The chromosome group accepts any
# non-":" text so that chrX / chrY / chrM match as well as numeric names;
# with a digits-only group those rows would be left unsplit.
GLPat <- "^chr([^:]+):(\\d+)-(\\d+):([+-])$"

# Rewrite GeneLocus as "chr|start|end|strand" on a copy of mydf so that both
# columns can be split on the same "|" delimiter in a single cSplit() call.
cSplit(
  as.data.table(mydf)[
    , GeneLocus := sub(GLPat, "\\1|\\2|\\3|\\4", GeneLocus)
  ],
  names(mydf),
  "|"
)[
  # Drop columns 3 and 4 by position, as the original did. Presumably these
  # are the empty padding columns cSplit adds for Gene -- verify against the
  # installed splitstackshape version.
  # The LHS is parenthesised because `with = FALSE` combined with `:=` is
  # deprecated in data.table.
  , (3:4) := NULL
][]
# Gene_1 Gene_2 GeneLocus_1 GeneLocus_2 GeneLocus_3 GeneLocus_4
# 1: CPA1 1357 7 130020290 130027948 +
# 2: GUCY2D 3000 17 7905988 7923658 +
# 3: UBC 7316 12 125396194 125399577 -
# 4: C11orf95 65998 11 63527365 63536113 -
# 5: ANKMY2 57037 7 16639413 16685398 -
或者,您可以尝试 “SOfun” 包中的 `col_flatten`,可以这样做:
library(SOfun)
# as.data.table() and .SD come from data.table; attach it explicitly rather
# than relying on another package to have done so.
library(data.table)

# Captures chromosome, start, end and strand from
# "chr<name>:<start>-<end>:<strand>". The chromosome group accepts any
# non-":" text so that chrX / chrY / chrM match as well as numeric names.
Pat <- "^chr([^:]+):(\\d+)-(\\d+):([+-])$"

# Rewrite locus strings as "chr|start|end|strand", then split every element
# on the literal "|". Values that do not match Pat are split on "|" as they
# are. Returns a list with one character vector per input element.
Fun <- function(invec) {
  strsplit(sub(Pat, "\\1|\\2|\\3|\\4", invec), "|", fixed = TRUE)
}

# Turn every column of a copy of mydf into a list column of pieces, then
# flatten those list columns into separate columns.
col_flatten(as.data.table(mydf)[, lapply(.SD, Fun)], names(mydf), drop = TRUE)
# Gene_1 Gene_2 GeneLocus_1 GeneLocus_2 GeneLocus_3 GeneLocus_4
# 1: CPA1 1357 7 130020290 130027948 +
# 2: GUCY2D 3000 17 7905988 7923658 +
# 3: UBC 7316 12 125396194 125399577 -
# 4: C11orf95 65998 11 63527365 63536113 -
# 5: ANKMY2 57037 7 16639413 16685398 -
SOfun 仅在 GitHub 上提供,因此您可以使用以下命令安装:
# NOTE(review): this downloads and executes an R script over plain HTTP
# (no TLS), so its contents cannot be authenticated in transit. Review the
# script before running it.
# install_github() is presumably defined by the sourced script -- verify.
source("http://news.mrdwab.com/install_github.R")
# Install the SOfun package from the GitHub repository "mrdwab/SOfun".
install_github("mrdwab/SOfun")