我有下面这个使用 for 循环的函数:
#' Extract every peptide window from one amino-acid sequence.
#'
#' @param input.seq A single amino-acid sequence (length-one vector).
#' @param peptides Data frame with the columns `Name`, `StartAA` and `EndAA`
#'   (1-based, inclusive positions as understood by `substr()`). Defaults to
#'   the `peptides.df` object visible from where this function is defined,
#'   which is what the function always used before this argument existed.
#' @return A data frame with one row per row of `peptides` and two character
#'   columns, `peptide.name` and `peptide.seq`. An empty `peptides` table
#'   yields a zero-row data frame.
getSequences <- function(input.seq, peptides = peptides.df) {
  # One sequence per call; a longer vector would be recycled against the
  # peptide rows and silently produce mismatched results.
  stopifnot(length(input.seq) == 1L)

  n.peptides <- nrow(peptides)

  data.frame(
    # as.character() keeps the labels (not the integer codes) when `Name`
    # was read in as a factor.
    peptide.name = as.character(peptides$Name),
    # substr() only recycles start/stop up to length(x), so the sequence is
    # repeated once per peptide to cut all windows in a single call.
    peptide.seq = substr(
      rep_len(input.seq, n.peptides),
      peptides$StartAA,
      peptides$EndAA
    ),
    stringsAsFactors = FALSE
  )
}
# Run the extraction for the single sequence stored in `input.seq`;
# `getSequences()` takes the peptide positions from `peptides.df`, so both
# objects must already exist when this line is evaluated.
test.results <- getSequences(input.seq)
该函数接收一条氨基酸序列,并根据一个包含各肽起始和终止位置的表(peptides.df),在不同位置截取该序列的子串,从而生成一组肽。
例如氨基酸序列:
# Example amino-acid sequence used as the single input to getSequences().
input.seq <- "MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE"
以下是 peptides.df 的前几行:
Name StartAA EndAA
peptide_1 25 48
peptide_2 33 56
peptide_3 41 64
Current Output peptide.result:
peptide.name peptide.sequence
peptide_1 QNYWEHPYQNSDVYRPINEHREHP
peptide_2 QNSDVYRPINEHREHPKEYEYPLH
peptide_3 INEHREHPKEYEYPLHQEHTYQQE
如何扩展这个函数,使其接收一个包含样本编号及其输入序列的数据框?对于每个样本编号及其序列,我想像示例中那样生成一组肽。
新输入:数据框 sample_sequences(200 个样本及其各自的输入序列)
sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
...
sample200 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
新输出:sample_peptides
sample1 peptide_1 QNYWEHPYQNSDVYRPINEHREHP
sample1 peptide_2 QNSDVYRPINEHREHPKEYEYPLH
sample1 peptide_3 INEHREHPKEYEYPLHQEHTYQQE
sample2 peptide_1 QNYWEHPYQNSDVYRPINEHREHP
sample2 peptide_2 QNSDVYRPINEHREHPKEYEYPLH
sample2 peptide_3 INEHREHPKEYEYPLHQEHTYQQE
sample3 peptide_1 QNYWEHPYQNSDVYRPINEHREHP
sample3 peptide_2 QNSDVYRPINEHREHPKEYEYPLH
sample3 peptide_3 INEHREHPKEYEYPLHQEHTYQQE
...
sample200 peptide_1 QNYWEHPYQNSDVYRPINEHREHP
sample200 peptide_2 QNSDVYRPINEHREHPKEYEYPLH
sample200 peptide_3 INEHREHPKEYEYPLHQEHTYQQE
答案 0 :(得分:0)
您可以使用 tidyr 和 dplyr 来避免循环。先用 crossing 将 sample_sequences 与所有可能的肽两两组合展开,然后在 mutate 中用 substr 截取每条肽的序列:
library(dplyr);library(tidyr)
# Peptide windows: one row per peptide with its name and the start/end
# positions to cut out of a sequence. Parsed from inline text so the example
# is self-contained; Name is kept as character rather than factor.
peptides.df <- read.table(
  text = " Name StartAA EndAA
peptide_1 25 48
peptide_2 33 56
peptide_3 41 64",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Example input: one row per sample with its amino-acid sequence (three
# samples here). Parsed from inline text; both columns stay character.
sample_sequences <- read.table(
  text = " sample sequence
sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Pair every sample row with every peptide row, then cut each peptide's
# StartAA..EndAA window out of that sample's sequence.
sample_sequences %>%
  crossing(peptides.df) %>%
  mutate(peptide.sequence = substr(sequence, StartAA, EndAA))
sample sequence Name StartAA EndAA peptide.sequence
1 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1 25 48 QNYWEHPYQNSDVYRPINEHREHP
2 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2 33 56 QNSDVYRPINEHREHPKEYEYPLH
3 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3 41 64 INEHREHPKEYEYPLHQEHTYQQE
4 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1 25 48 QNYWEHPYQNSDVYRPINEHREHP
5 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2 33 56 QNSDVYRPINEHREHPKEYEYPLH
6 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3 41 64 INEHREHPKEYEYPLHQEHTYQQE
7 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1 25 48 QNYWEHPYQNSDVYRPINEHREHP
8 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2 33 56 QNSDVYRPINEHREHPKEYEYPLH
9 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3 41 64 INEHREHPKEYEYPLHQEHTYQQE