我在名为PeakBoundaries的数据框中有如下列:
chrom
chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509
我想将列分开,以便列在数据框中如下所示:
chr chrStart chrEnd
chr11 69464719 69502928
chr7 55075808 55093954
chr8 128739772 128762863
chr3 169389459 169490555
等
我尝试了正则表达式的方法,但始终无法把匹配到的内容放入新列:
# Failing attempt: grep() receives only a pattern (no vector to search), and
# PeakBoundaries$chrom sits in the column position of `[`.
PeakBoundaries$chrOnly <- PeakBoundaries[grep("\\w+?=\\:"),PeakBoundaries$chrom]
我遇到了错误:
Error in `[.data.frame`(PeakBoundaries, grep("\\w+?=\\:"), PeakBoundaries$chrom) :
  undefined columns selected(选择了未定义的列)
答案 0 :(得分:4)
试试这个——不需要复杂的正则匹配,只需要 strsplit 函数:
# Example input: one "chr:start-end" region string per row (read in as column V1).
dat <- read.table(text = "chr11:69464719-69502928
chr7:55075808-55093954
chr8:128739772-128762863
chr3:169389459-169490555
chr17:37848534-37877201
chr19:30306758-30316875
chr1:150496857-150678056
chr12:69183279-69260755
chr11:77610143-77641464
chr8:38191804-38260814
chr12:58135797-58156509", stringsAsFactors = FALSE)

# Split every region on ":" or "-" into its pieces.
parts <- strsplit(dat[, 1], split = "[:-]")

# Every region must yield exactly chr, start and end; otherwise the values
# would be silently shifted or recycled when they are bound into three columns.
stopifnot(all(lengths(parts) == 3L))

# One row per region, appended as columns 2 to 4.
dat[, 2:4] <- do.call(rbind, parts)
colnames(dat) <- c("chrom", "chr", "chrStart", "chrEnd")

# Convert last two columns from character to numeric:
dat$chrStart <- as.numeric(dat$chrStart)
dat$chrEnd <- as.numeric(dat$chrEnd)
结果
> dat
chrom chr chrStart chrEnd
1 chr11:69464719-69502928 chr11 69464719 69502928
2 chr7:55075808-55093954 chr7 55075808 55093954
3 chr8:128739772-128762863 chr8 128739772 128762863
4 chr3:169389459-169490555 chr3 169389459 169490555
5 chr17:37848534-37877201 chr17 37848534 37877201
6 chr19:30306758-30316875 chr19 30306758 30316875
7 chr1:150496857-150678056 chr1 150496857 150678056
8 chr12:69183279-69260755 chr12 69183279 69260755
9 chr11:77610143-77641464 chr11 77610143 77641464
10 chr8:38191804-38260814 chr8 38191804 38260814
11 chr12:58135797-58156509 chr12 58135797 58156509
修改
您可以直接在现有数据框上完成所有操作:将 dat[,1] 替换为 PeakBoundaries$chrom,将 dat[,2:4] 替换为 PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)],这样就可以了!
按OP编辑
好的,我的数据集似乎有些特殊,不过在Dominic的帮助下我已经把问题解决了,现在的代码如下:
# Make sure PeakBoundaries is a data frame before subsetting it.
PeakBoundaries <- as.data.frame(PeakBoundaries)
# Drop the first row.
# NOTE(review): presumably a stray header/junk row — confirm against the data.
PeakBoundaries <- PeakBoundaries[-1,]
# Coerce back to a data frame.
# NOTE(review): if the frame has a single column, the subsetting above returns
# a bare vector, and this call would presumably name the resulting column
# "PeakBoundaries" — which matches the column used below. Confirm.
PeakBoundaries <- as.data.frame(PeakBoundaries)
# Store the region strings as character, since strsplit() needs character input.
PeakBoundaries$PeakBoundaries <-
as.character(PeakBoundaries$PeakBoundaries)
# Split every region string on ":" or "-" and append the pieces as three new
# columns after the existing ones (the matrix is filled row by row).
PeakBoundaries[,(ncol(PeakBoundaries)+1):(ncol(PeakBoundaries)+3)] <-
matrix(unlist(strsplit(PeakBoundaries$PeakBoundaries,
split = "\\:|\\-")), ncol=3, byrow=TRUE)
答案 1 :(得分:2)
Dominic的答案的缩短版本,使插入成为一行:
# Example regions in "chr:start-end" form, one string per row.
dat <- data.frame(
  chrom = c(
    "chr11:69464719-69502928",
    "chr7:55075808-55093954",
    "chr8:128739772-128762863",
    "chr3:169389459-169490555",
    "chr17:37848534-37877201",
    "chr19:30306758-30316875",
    "chr1:150496857-150678056",
    "chr12:69183279-69260755",
    "chr11:77610143-77641464",
    "chr8:38191804-38260814",
    "chr12:58135797-58156509"
  ),
  # Keep the strings as character on every R version; strsplit() rejects factors.
  stringsAsFactors = FALSE
)

# One-line insert using base R only (no stringr needed): split each region on
# ":" or "-" and bind the pieces row-wise into the three new columns.
dat[, c("chr", "chrStart", "chrEnd")] <- do.call(rbind, strsplit(dat$chrom, "[:-]"))

# The coordinates arrive as character; store them as numbers.
dat$chrStart <- as.numeric(dat$chrStart)
dat$chrEnd <- as.numeric(dat$chrEnd)
答案 2 :(得分:1)
我们可以试试
library(tidyr)
# Three capture groups: everything before ":" (chr), everything up to "-"
# (chrStart), and the remainder (chrEnd). convert = TRUE asks tidyr to
# type-convert the new columns.
extract(
  data = dat,
  col = chrom,
  into = c("chr", "chrStart", "chrEnd"),
  regex = "([^:]+):([^-]+)-(.*)",
  convert = TRUE
)
# chr chrStart chrEnd
#1 chr11 69464719 69502928
#2 chr7 55075808 55093954
#3 chr8 128739772 128762863
#4 chr3 169389459 169490555
#5 chr17 37848534 37877201
#6 chr19 30306758 30316875
#7 chr1 150496857 150678056
#8 chr12 69183279 69260755
#9 chr11 77610143 77641464
#10 chr8 38191804 38260814
#11 chr12 58135797 58156509
使用 data.table 开发版(v1.9.5)的更快选项。该版本可以从此处(here)安装。
library(data.table) # v1.9.5+
# Names for the three pieces of each "chr:start-end" string.
nm1 <- c("chr", "chrStart", "chrEnd")
# Split on ":" or "-" (one list element per piece, type-converted), turn the
# list into a data.table, and name its columns.
res <- setnames(
  setDT(tstrsplit(dat$chrom, "[:-]", type.convert = TRUE)),
  nm1
)
res
# chr chrStart chrEnd
# 1: chr11 69464719 69502928
# 2: chr7 55075808 55093954
# 3: chr8 128739772 128762863
# 4: chr3 169389459 169490555
# 5: chr17 37848534 37877201
# 6: chr19 30306758 30316875
# 7: chr1 150496857 150678056
# 8: chr12 69183279 69260755
# 9: chr11 77610143 77641464
#10: chr8 38191804 38260814
#11: chr12 58135797 58156509
或
library(splitstackshape)
# Split the chrom column on the regex ":|-" (fixed = FALSE makes the separator
# a regular expression), then rename the resulting columns; the trailing []
# forces the result to print.
setnames(
  cSplit(
    indt = dat,
    splitCols = "chrom",
    sep = ":|-",
    fixed = FALSE,
    type.convert = TRUE
  ),
  nm1
)[]
# Reproducible example data: a one-column data frame whose chrom column is a
# factor of "chr:start-end" region strings (same level order and row order as
# the dput() form).
dat <- local({
  region_levels <- c(
    "chr1:150496857-150678056",
    "chr11:69464719-69502928",
    "chr11:77610143-77641464",
    "chr12:58135797-58156509",
    "chr12:69183279-69260755",
    "chr17:37848534-37877201",
    "chr19:30306758-30316875",
    "chr3:169389459-169490555",
    "chr7:55075808-55093954",
    "chr8:128739772-128762863",
    "chr8:38191804-38260814"
  )
  # Position of each row's value within region_levels.
  row_codes <- c(2L, 9L, 10L, 8L, 6L, 7L, 1L, 5L, 3L, 11L, 4L)
  data.frame(chrom = factor(region_levels[row_codes], levels = region_levels))
})