我有这两个数据集: Before 包含5列(chromosome,start,end,行号,得分)
chrI 861 870 87 5
chrI 871 880 88 11
chrI 881 890 89 11
chrI 891 900 90 19
chrI 901 910 91 19
chrI 911 920 92 20
chrI 921 930 93 20
chrI 931 940 94 20
chrI 941 950 95 19
chrI 951 960 96 19
chrI 961 970 97 19
chrI 971 980 98 19
chrI 981 990 99 25
chrI 991 1000 100 20
chrI 1001 1010 101 20
chrI 1011 1020 102 20
chrI 1021 1030 103 20
chrI 1031 1040 104 15
chrI 1041 1050 105 14
chrI 1051 1060 106 14
chrI 1061 1070 107 13
chrI 1071 1080 108 13
chrI 1081 1090 109 13
chrI 1091 1100 110 7
chrI 1101 1110 111 7
Peaks 包含4列(chromosome,start,end,value)
"chrI" 880 1091 383
"chrI" 1350 1601 302
"chrI" 1680 1921 241
"chrI" 2220 2561 322
"chrI" 2750 2761 18
"chrI" 3100 3481 420
"chrI" 3660 4211 793
"chrI" 4480 4491 20
"chrI" 4710 4871 195
"chrI" 5010 5261 238
对于 Peaks 的每一行,我想从 Before 中提取相应的行(例如,对第一行来说,就是 880 到 1091 之间的所有行),找到得分最高的那一行并将其写入新文件。第一行的预期输出:
chrI 981 990 99 25
为此,我已经写了这个函数:
# Find the highest-scoring row(s) of a bin table inside one peak.
#
# Arguments:
#   x      One peak as a vector or one-row data frame: chromosome, start, end, ...
#          When called through apply() on a data frame the numbers arrive as
#          (possibly space-padded) strings, so they are converted here.
#   y      Bin table with columns V1 (chromosome), V2 (start), V3 (end),
#          V4 (line number) and V5 (score). Defaults to the global `Before`.
#   output Optional path; when given, the selected row(s) are appended to it
#          as tab-separated text without header.
#
# Returns (invisibly) the row(s) of `y` that overlap the peak and carry the
# maximum score; a zero-row data frame when nothing overlaps.
summit <- function(x, y = Before, output = NULL) {
  chrom <- trimws(as.character(x[[1]]))
  start <- as.numeric(x[[2]])
  end <- as.numeric(x[[3]])

  # Peak boundaries rarely coincide with bin boundaries, so select by overlap
  # instead of looking for bins whose start/end equal the peak's start/end.
  in_peak <- which(y$V1 == chrom & y$V2 <= end & y$V3 >= start)
  hits <- y[in_peak, , drop = FALSE]
  if (nrow(hits) == 0) {
    return(invisible(hits))
  }

  # V5 is the score column (V4 is only the line number); keep ties.
  top_score <- max(hits$V5, na.rm = TRUE)
  maximum <- hits[which(hits$V5 == top_score), , drop = FALSE]

  if (!is.null(output)) {
    write.table(
      maximum,
      file = output,
      sep = "\t",
      append = TRUE,
      col.names = FALSE,
      row.names = FALSE,
      quote = FALSE
    )
  }
  invisible(maximum)
}
# Call summit() once per row of Peaks; `output` is forwarded to summit().
apply(X = Peaks, MARGIN = 1, FUN = summit, output = 'peaks_list.bed')
我没有收到错误消息,但它运行了一整晚也没有给出结果,所以我猜我的代码有问题,但我不知道问题出在哪里。
我也尝试了这个:
# Build genomic ranges for the peaks and for the scored bins.
Peaks_Range <- GRanges(
  seqnames = Peaks$V1,
  ranges = IRanges(start = Peaks$V2, end = Peaks$V3)
)
Before_Range <- GRanges(
  seqnames = Before$V1,
  ranges = IRanges(start = Before$V2, end = Before$V3),
  score = Before$V5
)

# One row per (peak, overlapping bin) pair.
Merged <- mergeByOverlaps(Peaks_Range, Before_Range)
Merged <- as.data.frame(Merged)

if (nrow(Merged) > 0) {
  # Identify each peak by chromosome + start + end, so peaks that share a start
  # position on different chromosomes are not mixed up.
  peak_id <- paste(
    Merged$Peaks_Range.seqnames,
    Merged$Peaks_Range.start,
    Merged$Peaks_Range.end,
    sep = ":"
  )
  # Per-peak maximum score, recycled to every row of that peak; this replaces
  # the loop that rescanned Merged and reopened the output file for every peak.
  best_score <- ave(Merged$score, peak_id, FUN = max)
  summits <- Merged[which(Merged$score == best_score), , drop = FALSE]
  write.table(
    summits,
    'peaks_list.bed',
    sep = "\t",
    append = TRUE,
    col.names = FALSE,
    row.names = FALSE,
    quote = FALSE
  )
}
它可以工作(我认为),但非常慢,所以我在寻找另一种实现方法。
有没有人有任何想法?
答案 0 :(得分:0)
您可以使用 cut 来帮助您获取索引。
# Read both input tables from a hard-coded working directory and label their
# columns.
setwd("/home/wang/Downloads")

# Five-column table without a header row.
before <- read.table(
  file = "before.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(before) <- c("chromosome", "start", "end", "line number", "score")

# Four-column table without a header row; only the double quote is treated as
# a quoting character.
peaks <- read.table(
  file = "peaks.txt",
  header = FALSE,
  quote = "\"",
  stringsAsFactors = FALSE
)
names(peaks) <- c("chromosome", "start", "end", "value")
# Find the highest-scoring row of the global table `before` inside one peak.
#
# Arguments:
#   peaks_vec  One peak as a vector: chromosome, start, end, ... Numbers may
#              arrive as strings (e.g. from apply() on a character matrix).
#
# Returns the selected row of `before` flattened with unlist(), or a vector of
# NA (one per column of `before`) when the peak cannot be located.
#
# Relies on the global data frame `before` with columns in the order
# chromosome, start, end, line number, score. The index returned by cut() is
# used as a row index, so rows must be sorted by start with unique starts
# within each chromosome.
summit <- function(peaks_vec) {
  peak_chrom <- peaks_vec[[1]]
  peak_start <- as.numeric(peaks_vec[[2]])
  peak_end <- as.numeric(peaks_vec[[3]])
  no_match <- rep(NA, ncol(before))

  # Filter on the column explicitly: inside subset(), the expression
  # `chromosome == chromosome` compared the column with itself and kept
  # every row of every chromosome.
  filter_before <- before[which(before[[1]] == peak_chrom), , drop = FALSE]

  # cut() interprets a single break as "number of intervals", so at least two
  # rows are required for the starts to act as break points.
  if (nrow(filter_before) < 2) {
    return(no_match)
  }

  # cut() yields i when breaks[i] < x <= breaks[i + 1]; adding 1 gives the
  # first row whose start is >= x.
  starts <- filter_before[[2]]
  up_index <- cut(peak_end, starts, labels = FALSE) + 1
  down_index <- cut(peak_start, starts, labels = FALSE) + 1

  # NA means the position lies outside the range covered by the starts.
  if (is.na(down_index) || is.na(up_index)) {
    return(no_match)
  }

  new_filter_before <- filter_before[down_index:up_index, ]
  max_index <- which.max(new_filter_before[, 5])
  unlist(new_filter_before[max_index, ])
}
# Run summit() on every peak; apply() returns one column per peak, so the
# matrix is transposed to get one row per peak.
result <- t(apply(X = as.matrix(peaks), MARGIN = 1, FUN = summit))
# Drop rows containing NA, then reuse the column names of `before`.
remove_na_result <- as.data.frame(na.omit(result))
names(remove_na_result) <- colnames(before)
最终输出是:
chromosome start end line number score
1 chrI 981 990 99 25
希望我的回答对您有帮助。