Question

我有下面这个使用 for 循环的函数：

#' Extract peptide sub-sequences from an amino-acid sequence.
#'
#' @param input.seq Character vector holding the amino-acid sequence(s);
#'   typically a single string.
#' @param peptides Data frame with columns `Name`, `StartAA` and `EndAA`
#'   (start/end positions passed to `substr()`, i.e. 1-based and inclusive).
#'   Defaults to the `peptides.df` object found from the function's
#'   enclosing environment, which is what the function used implicitly before.
#' @return A data frame with character columns `peptide.name` and
#'   `peptide.seq`, one row per peptide (per input sequence). An empty
#'   `peptides` table yields a zero-row data frame.
getSequences <- function(input.seq, peptides = peptides.df) {
  n_seq <- length(input.seq)

  # substr() only returns as many elements as its first argument has, so the
  # sequence is repeated once per peptide and the positions are expanded to
  # match. This replaces the row-by-row rbind() loop with one vectorised call
  # and keeps the peptide-major row order.
  data.frame(
    # as.character() keeps factor names as labels rather than integer codes.
    peptide.name = rep(as.character(peptides$Name), each = n_seq),
    peptide.seq = substr(
      rep(input.seq, times = nrow(peptides)),
      rep(peptides$StartAA, each = n_seq),
      rep(peptides$EndAA, each = n_seq)
    ),
    stringsAsFactors = FALSE
  )
}

# Run getSequences() on the sequence stored in input.seq and keep the result.
test.results <- getSequences(input.seq)

该函数接受一条氨基酸序列，然后结合一个包含起始和终止位置的肽表（peptides.df），在不同位置截取该序列的子串，从而生成一组肽序列。

例如氨基酸序列：

# Example amino-acid sequence used as input.
input.seq <- "MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE"

以下是 peptides.df 的前几行：

   Name StartAA EndAA
peptide_1   25    48
peptide_2   33    56
peptide_3   41    64

当前输出 peptide.result：

peptide.name    peptide.sequence
peptide_1   QNYWEHPYQNSDVYRPINEHREHP
peptide_2   QNSDVYRPINEHREHPKEYEYPLH
peptide_3   INEHREHPKEYEYPLHQEHTYQQE

如何扩展该函数，使其接受一个包含样本编号及其输入序列的数据框？对于每个样本及其序列，我想像示例中那样生成一组肽。

新输入：数据框 sample_sequences（200 个样本及其输入序列）

sample1     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample2     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample3     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
...
sample200   MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE

新输出：sample_peptides

sample1 peptide_1   QNYWEHPYQNSDVYRPINEHREHP
sample1 peptide_2   QNSDVYRPINEHREHPKEYEYPLH
sample1 peptide_3   INEHREHPKEYEYPLHQEHTYQQE
sample2 peptide_1   QNYWEHPYQNSDVYRPINEHREHP
sample2 peptide_2   QNSDVYRPINEHREHPKEYEYPLH
sample2 peptide_3   INEHREHPKEYEYPLHQEHTYQQE
sample3 peptide_1   QNYWEHPYQNSDVYRPINEHREHP
sample3 peptide_2   QNSDVYRPINEHREHPKEYEYPLH
sample3 peptide_3   INEHREHPKEYEYPLHQEHTYQQE
...
sample200   peptide_1   QNYWEHPYQNSDVYRPINEHREHP
sample200   peptide_2   QNSDVYRPINEHREHPKEYEYPLH
sample200   peptide_3   INEHREHPKEYEYPLHQEHTYQQE

Answer 1

您可以借助 tidyr 和 dplyr 来避免循环。先用 crossing 将 sample_sequences 与所有肽逐一组合展开，然后在 mutate 中

只需调用一次简单的 substr 即可：

library(dplyr);library(tidyr)
# Peptide table: one row per peptide with its name and start/end positions.
peptides.df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "   Name StartAA EndAA
peptide_1   25    48
peptide_2   33    56
peptide_3   41    64"
)

# Sample table: one row per sample with its full input sequence.
sample_sequences <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = " sample sequence
sample1     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample2     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE
sample3     MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE"
)

# Pair every sample row with every peptide row, then cut each peptide
# out of that sample's sequence.
sample_sequences %>%
  crossing(peptides.df) %>%
  mutate(peptide.sequence = substr(sequence, StartAA, EndAA))

   sample                                                         sequence      Name StartAA EndAA         peptide.sequence
1 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1      25    48 QNYWEHPYQNSDVYRPINEHREHP
2 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2      33    56 QNSDVYRPINEHREHPKEYEYPLH
3 sample1 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3      41    64 INEHREHPKEYEYPLHQEHTYQQE
4 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1      25    48 QNYWEHPYQNSDVYRPINEHREHP
5 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2      33    56 QNSDVYRPINEHREHPKEYEYPLH
6 sample2 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3      41    64 INEHREHPKEYEYPLHQEHTYQQE
7 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_1      25    48 QNYWEHPYQNSDVYRPINEHREHP
8 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_2      33    56 QNSDVYRPINEHREHPKEYEYPLH
9 sample3 MRKLYCVLLLSAFEFTYMINFGRGQNYWEHPYQNSDVYRPINEHREHPKEYEYPLHQEHTYQQE peptide_3      41    64 INEHREHPKEYEYPLHQEHTYQQE

需要帮助扩展R

1 个答案: