我有 5 个细菌基因组的 FASTA 文件,需要为每个样本找到能将其与其他样本区分开的最短序列。我不知道是否已有软件包可以完成这项工作,还是应该自己编写代码。我在这个问题上遇到了很大的困难,您能指导我如何解决吗?
答案 0 :(得分:0)
可能还有更优雅/更快的方法,但下面是只使用基础 R(base R)的一种可行方案(假设我正确理解了您的问题):
# Enumerate every contiguous sub-sequence of one sequence.
#
# Args:
#   chars: a vector holding one sequence, one symbol per element.
# Returns:
#   An unnamed list of vectors ordered by start position and then by
#   increasing length. A sequence of length n yields n * (n + 1) / 2 entries;
#   an empty sequence yields an empty list.
enumerate.sub.seqs <- function(chars) {
  n <- length(chars)
  # Preallocate the result instead of growing the list inside the loop.
  sub.seqs <- vector("list", n * (n + 1) / 2)
  k <- 0L
  # seq_len() skips the loop when n == 0; 1:0 would iterate over c(1, 0).
  for (start in seq_len(n)) {
    for (end in start:n) {
      k <- k + 1L
      sub.seqs[[k]] <- chars[start:end]
    }
  }
  sub.seqs
}

# Find the shortest sub-sequences of one sample that occur in no other sample.
#
# Args:
#   candidates: list of sub-sequences of the sample being checked.
#   others: list of sub-sequences of all other samples (may be NULL or empty).
# Returns:
#   A list with every element of `candidates` of minimal length that is not
#   identical() to any element of `others`, in their original order
#   (repeated candidates are kept). Returns an empty list when every
#   candidate also occurs in `others`.
shortest.distinct.sub.seqs <- function(candidates, others) {
  candidate.lengths <- lengths(candidates)
  other.lengths <- lengths(others)
  # Search shortest-first and stop at the first length that has a distinct
  # candidate; longer candidates can never be part of the answer.
  for (len in sort(unique(candidate.lengths))) {
    same.len.candidates <- candidates[candidate.lengths == len]
    # Identical vectors must have equal length, so only these can match.
    same.len.others <- others[other.lengths == len]
    is.distinct <- vapply(same.len.candidates, function(candidate) {
      !any(vapply(same.len.others, identical, logical(1), candidate))
    }, logical(1))
    if (any(is.distinct)) {
      return(same.len.candidates[is.distinct])
    }
  }
  list()
}

# Generate example sequences: ten vectors of random lower-case letters whose
# lengths are drawn without replacement from 10..20.
seqs.to.check <- lapply(sample(10:20, 10),
                        function(n) sample(letters, n, replace = TRUE))

# Get list of lists of all sub-sequences for each example sequence.
all.sub.seqs <- lapply(seqs.to.check, enumerate.sub.seqs)

# Get list of lists of minimum-length distinct sub-sequences: element i holds
# the shortest sub-sequences of sequence i that appear in no other sequence.
min.distinct.seqs <- lapply(seq_along(seqs.to.check), function(seq.i) {
  shortest.distinct.sub.seqs(
    all.sub.seqs[[seq.i]],
    # Pool the sub-sequences of every other sequence into one flat list.
    unlist(all.sub.seqs[-seq.i], recursive = FALSE)
  )
})