Question

此方法具有查找复杂重复的功能。

我无法从代码输出中找出答案。

样本输入序列：'TGTATACACACTGTATACACACACTGTATAC'。

输出为((TGTAT)1 (AC)*)3

def word_extract(seq,wsize):
    """Return the distinct substrings of length ``wsize`` found in ``seq``.

    Every window ``seq[i:i+wsize]`` for ``i`` from 0 to ``len(seq)-wsize``
    is collected and duplicates are dropped.

    The words are returned in order of first appearance, so the result is
    reproducible from run to run (de-duplicating through a ``set`` gives an
    arbitrary order that depends on string hashing).

    Args:
        seq: the sequence (string) to scan.
        wsize: the window length.

    Returns:
        A list of unique words; empty when ``wsize`` exceeds ``len(seq)``.
    """
    windows = (seq[i:i + wsize] for i in range(len(seq) - wsize + 1))
    # dict keys are unique and keep insertion order, unlike a set.
    return list(dict.fromkeys(windows))

def find_occurs(seq,word):
    """Return the start index of every occurrence of ``word`` in ``seq``.

    Overlapping occurrences are included: the search resumes one position
    after each hit.  The loop runs until ``find`` reports no further match
    instead of relying on ``seq.count(word)``, because ``count`` only counts
    non-overlapping matches and would cut the scan short for words that
    overlap themselves (e.g. "aa" in "aaaa").

    Args:
        seq: the string to search in.
        word: the substring to look for.

    Returns:
        A list of indices in ascending order; empty if ``word`` is absent.
    """
    locs = []
    k = seq.find(word)
    while k != -1:
        locs.append(k)
        # Step by one, not len(word), so overlapping hits are found too.
        k = seq.find(word, k + 1)
    return locs

def all_dists(locs):
    """Return the differences between every pair of entries in ``locs``.

    For each position, the value at that position minus each earlier value
    is emitted, grouped by the later element in list order.  With ascending
    input all differences are positive.
    """
    return [
        later - earlier
        for idx, later in enumerate(locs)
        for earlier in locs[:idx]
    ]

def histogram(upper,dists):
    """Count how often each distance from 1 to ``upper`` occurs in ``dists``.

    Returns a list of ``upper + 1`` counters indexed by distance.  Slot 0 is
    never incremented, and distances outside ``1 .. upper`` are ignored.
    """
    size = upper + 1
    counts = [0] * size
    for dist in dists:
        if not (0 < dist < size):
            continue
        counts[dist] += 1
    return counts

def find_repeats(hist,word,gamma=10):
    """Report the most frequent distance in ``hist`` if it is frequent enough.

    Args:
        hist: counts indexed by distance.  May be a plain list (as returned
            by ``histogram``) or any other iterable of counts; the previous
            implementation called ``.max()`` / ``.argmax()`` and therefore
            failed on plain lists.
        word: the word the histogram belongs to; passed through unchanged.
        gamma: minimum peak count required to report a repeat.

    Returns:
        ``(peak_count, word, distance)`` where ``distance`` is the first
        index holding the peak count, or an empty tuple when the histogram
        is empty or its peak is below ``gamma``.
    """
    counts = list(hist)
    if not counts:
        # Nothing to inspect: treat as "no repeat found" rather than crash.
        return ()
    mx = max(counts)
    if mx < gamma:
        return ()
    # index() returns the first position of the peak.
    return (mx, word, counts.index(mx))


# Demo run: for every distinct 2-letter word of the sample sequence, print
# where it occurs, the pairwise distances between those occurrences, and the
# histogram of distances up to 20.
seq = "tgtatacacactgtatacacacactgtatac"
wsize = 2
words = word_extract(seq, wsize)
for word in words:
    locs = find_occurs(seq, word)
    dists = all_dists(locs)
    hist = histogram(20, dists)
    print(word, locs, dists)
    print(hist)

如何从Hauth的直方图中推断出核苷酸序列中复杂的重复序列？

0 个答案: