此方法具有查找复杂重复的功能。
我无法从代码输出中找出答案。
样本输入序列：'TGTATACACACTGTATACACACACTGTATAC'。
输出为 ((TGTAT)1 (AC)*)3。
def word_extract(seq, wsize):
    """Return the distinct substrings of length ``wsize`` found in ``seq``.

    A window of ``wsize`` characters is slid over ``seq`` one position at a
    time and each distinct window is reported once.

    Args:
        seq: The sequence (string) to scan.
        wsize: Length of each word.

    Returns:
        A list of the unique words, ordered by first appearance in ``seq``.
        The list is empty when ``wsize`` is larger than ``len(seq)``.
    """
    # Track seen words separately so the output keeps first-appearance
    # order. Converting through a bare set would make the order arbitrary
    # (and, for strings, different from one interpreter run to the next).
    seen = set()
    unique = []
    for start in range(len(seq) - wsize + 1):
        word = seq[start:start + wsize]
        if word not in seen:
            seen.add(word)
            unique.append(word)
    return unique
def find_occurs(seq, word):
    """Return the start index of every occurrence of ``word`` in ``seq``.

    Overlapping occurrences are included, e.g. ``"aa"`` occurs in
    ``"aaaa"`` at positions 0, 1 and 2.

    Args:
        seq: The string to search in.
        word: The substring to look for.

    Returns:
        A list of start indices in ascending order; empty if ``word`` does
        not occur in ``seq``.
    """
    locs = []
    # Do not bound the loop with seq.count(word): count() only counts
    # non-overlapping matches, while this scan advances one character at a
    # time and therefore also finds overlapping ones.
    k = seq.find(word)
    while k != -1:
        locs.append(k)
        k = seq.find(word, k + 1)
    return locs
def all_dists(locs):
    """Return the difference between every pair of entries in ``locs``.

    For each position ``hi`` the differences ``locs[hi] - locs[lo]`` are
    emitted for all earlier positions ``lo`` (in increasing ``lo``), so the
    result holds ``len(locs) * (len(locs) - 1) / 2`` values.

    Args:
        locs: A sequence of numeric locations.

    Returns:
        A list of pairwise differences (later entry minus earlier entry).
    """
    return [
        later - locs[lo]
        for hi, later in enumerate(locs)
        for lo in range(hi)
    ]
def histogram(upper, dists):
    """Count how often each distance in ``1..upper`` appears in ``dists``.

    Args:
        upper: Largest distance that gets its own bin.
        dists: Iterable of integer distances.

    Returns:
        A list of ``upper + 1`` counts where index ``d`` holds the number of
        times ``d`` occurs in ``dists``. Bin 0 is never incremented, and
        values outside ``1..upper`` are ignored.
    """
    size = upper + 1
    counts = [0] * size
    for dist in dists:
        # Only strictly positive distances that fit in the table are binned.
        if 0 < dist < size:
            counts[dist] += 1
    return counts
def find_repeats(hist, word, gamma=10):
    """Report the dominant bin of ``hist`` if its count reaches ``gamma``.

    Args:
        hist: Histogram of counts. May be a plain sequence (such as the list
            returned by ``histogram``) or an array-like object that provides
            ``max()`` and ``argmax()`` methods.
        word: The word the histogram was built for; passed through unchanged
            into the result.
        gamma: Minimum peak count required to report a repeat.

    Returns:
        ``(peak_count, word, peak_index)`` when the largest count is at
        least ``gamma``; otherwise an empty tuple. An empty histogram also
        yields an empty tuple.
    """
    if len(hist) == 0:
        # Nothing to inspect, so there is no peak to report.
        return ()

    if hasattr(hist, "argmax"):
        # Array-like input: keep using its own reductions.
        mx = hist.max()
        here = hist.argmax()
    else:
        # Plain lists/tuples have no max()/argmax() methods; index() returns
        # the first position of the maximum.
        mx = max(hist)
        here = hist.index(mx)

    if mx < gamma:
        return ()
    return (mx, word, here)
# Driver: for every distinct 2-letter word in the sample sequence, print
# where it occurs, the pairwise distances between those occurrences, and a
# histogram of the distances (bins 0..20).
seq = "tgtatacacactgtatacacacactgtatac"
wsize = 2
words = word_extract(seq, wsize)
for word in words:
    locs = find_occurs(seq, word)
    dists = all_dists(locs)
    hist = histogram(20, dists)
    print(word, locs, dists)
    print(hist)