我试图有效地将精确的肽(26个字符A-Z 1 中的短氨基酸序列)映射到蛋白质(相同字母表的较长序列)。我知道的最有效的方法是Aho-Corasick
trie(肽是关键词)。遗憾的是,我无法在R中找到一个可以使用非核苷酸字母表的AC版本(Biostrings'PDict
和Starr's match_ac
都是硬编码的DNA。)
作为一个拐杖,我一直试图并行化基本的grep方法。但我无法找到一种方法,而不会产生大量的IO开销。这是一个简短的例子:
peptides = c("FSSSGGGGGGGR","GAHLQGGAK","GGSGGSYGGGGSGGGYGGGSGSR","IISNASCTTNCLAPLAK")
if (!exists("proteins"))
{
biocLite("biomaRt", ask=F, suppressUpdates=T, suppressAutoUpdate=T)
library(biomaRt)
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
proteins = getBM(attributes=c('peptide', 'refseq_peptide'), filters='refseq_peptide', values=c("NP_000217", "NP_001276675"), mart=ensembl)
row.names(proteins) = proteins$refseq_peptide
}
library(snowfall)
library(Biostrings)
library(plyr)
sfInit(parallel=T, cpus=detectCores()-1)
allPeptideInstances = NULL
i=1
increment=100
count=nrow(proteins)
while(T)
{
print(paste(i, min(count, i+increment), sep=":"))
text_source = proteins[i:min(count, i+increment),]
text = text_source$peptide
#peptideInstances = sapply(peptides, regexpr, text, fixed=T, useBytes=T)
peptideInstances = sfSapply(peptides, regexpr, text, fixed=T, useBytes=T)
dimnames(peptideInstances) = list(text_source$refseq_peptide, colnames(peptideInstances))
sparsePeptideInstances = alply(peptideInstances, 2, .fun = function(x) {x[x > 0]}, .dims = T)
allPeptideInstances = c(allPeptideInstances, sparsePeptideInstances, recursive=T)
if (i==count | nrow(text_source) < increment)
break
i = i+increment
}
sfStop()
这里有一些问题:
peptideInstances
这里是一个密集矩阵,所以
从每个工人那里返回它是非常冗长的。我打破了它
成块,这样我就不会处理40,000(蛋白质)x 60,000
(肽)矩阵。或者,如果有人知道R中有更好的解决方案,我很乐意使用它。我已经花了足够的时间在这上面我可能会更好地实施Aho-Corasick。
1 其中一些是含糊不清的代码,但为简单起见,请忽略它。
答案 0 :(得分:1)
我学习了Rcpp并自己实施了Aho-Corasick。现在,CRAN具有良好的通用多关键字搜索advanced guide here。
以下是一些使用示例:
listEquals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) || !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) }
# simple search of multiple keywords in a single text
keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word")
oneSearch = AhoCorasickSearch(keywords, "Is Abracadabra the Magic Word?")
stopifnot(listEquals(oneSearch[[1]][[1]], list(keyword="Abra", offset=4)))
stopifnot(listEquals(oneSearch[[1]][[2]], list(keyword="cadabra", offset=8)))
stopifnot(listEquals(oneSearch[[1]][[3]], list(keyword="the", offset=16)))
stopifnot(listEquals(oneSearch[[1]][[4]], list(keyword="Magic", offset=20)))
stopifnot(listEquals(oneSearch[[1]][[5]], list(keyword="Word", offset=26)))
# search a list of lists
# * sublists are accessed by index
# * texts are accessed by index
# * non-matched texts are kept (to preserve index order)
listSearch = AhoCorasickSearchList(keywords, list(c("What in", "the world"), c("is"), "secret about", "the Magic Word?"))
stopifnot(listEquals(listSearch[[1]][[1]], list()))
stopifnot(listEquals(listSearch[[1]][[2]][[1]], list(keyword="the", offset=1)))
stopifnot(listEquals(listSearch[[2]][[1]][[1]], list(keyword="is", offset=1)))
stopifnot(listEquals(listSearch[[3]], list()))
stopifnot(listEquals(listSearch[[4]][[1]][[1]], list(keyword="the", offset=1)))
stopifnot(listEquals(listSearch[[4]][[1]][[2]], list(keyword="Magic", offset=5)))
stopifnot(listEquals(listSearch[[4]][[1]][[3]], list(keyword="Word", offset=11)))
# named search of a list of lists
# * sublists are accessed by name
# * matched texts are accessed by name
# * non-matched texts are dropped
namedSearch = AhoCorasickSearchList(keywords, list(subject=c(phrase1="What in", phrase2="the world"),
verb=c(phrase1="is"),
predicate1=c(phrase1="secret about"),
predicate2=c(phrase1="the Magic Word?")))
stopifnot(listEquals(namedSearch$subject$phrase2[[1]], list(keyword="the", offset=1)))
stopifnot(listEquals(namedSearch$verb$phrase1[[1]], list(keyword="is", offset=1)))
stopifnot(listEquals(namedSearch$predicate1, list()))
stopifnot(listEquals(namedSearch$predicate2$phrase1[[1]], list(keyword="the", offset=1)))
stopifnot(listEquals(namedSearch$predicate2$phrase1[[2]], list(keyword="Magic", offset=5)))
stopifnot(listEquals(namedSearch$predicate2$phrase1[[3]], list(keyword="Word", offset=11)))
# named search of multiple texts in a single list with keyword grouping and aminoacid alphabet
# * all matches to a keyword are accessed by name
# * non-matched keywords are dropped
proteins = c(protein1="PEPTIDEPEPTIDEDADADARARARARAKEKEKEKEPEPTIDE",
protein2="DERPADERPAPEWPEWPEEPEERAWRAWWARRAGTAGPEPTIDEKESEQUENCE")
peptides = c("PEPTIDE", "DERPA", "SEQUENCE", "KEKE", "PEPPIE")
peptideSearch = AhoCorasickSearch(peptides, proteins, alphabet="aminoacid", groupByKeyword=T)
stopifnot(listEquals(peptideSearch$PEPTIDE, list(list(keyword="protein1", offset=1),
list(keyword="protein1", offset=8),
list(keyword="protein1", offset=37),
list(keyword="protein2", offset=38))))
stopifnot(listEquals(peptideSearch$DERPA, list(list(keyword="protein2", offset=1),
list(keyword="protein2", offset=6))))
stopifnot(listEquals(peptideSearch$SEQUENCE, list(list(keyword="protein2", offset=47))))
stopifnot(listEquals(peptideSearch$KEKE, list(list(keyword="protein1", offset=29),
list(keyword="protein1", offset=31),
list(keyword="protein1", offset=33))))
stopifnot(listEquals(peptideSearch$PEPPIE, NULL))
# grouping by keyword without text names: offsets are given without reference to the text
names(proteins) = NULL
peptideSearch = AhoCorasickSearch(peptides, proteins, groupByKeyword=T)
stopifnot(listEquals(peptideSearch$PEPTIDE, list(1, 8, 37, 38)))
stopifnot(listEquals(peptideSearch$DERPA, list(1, 6)))
stopifnot(listEquals(peptideSearch$SEQUENCE, list(47)))
stopifnot(listEquals(peptideSearch$KEKE, list(29, 31, 33)))