我正在关注clrs书并遇到了" kmp算法"用于字符串匹配。我使用python(as-is)实现了它。但是,由于某种原因,它似乎不起作用。我的错在哪里?
代码如下:
def kmp_matcher(t,p):
n=len(t)
m=len(p)
# pi=[0]*n;
pi = compute_prefix_function(p)
q=-1
for i in range(n):
while(q>0 and p[q]!=t[i]):
q=pi[q]
if(p[q]==t[i]):
q=q+1
if(q==m):
print "pattern occurs with shift "+str(i-m)
q=pi[q]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
pi[1]=0
k=0
for q in range(2,m):
while(k>0 and p[k]!=p[q]):
k=pi[k]
if(p[k]==p[q]):
k=k+1
pi[q]=k
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t,p)
答案 0 :(得分:2)
这是我基于CLRs KMP算法编写的一个类,它包含了你所追求的内容。请注意,这里只接受DNA“字符”。
class KmpMatcher(object):
def __init__(self, pattern, string, stringName):
self.motif = pattern.upper()
self.seq = string.upper()
self.header = stringName
self.prefix = []
self.validBases = ['A', 'T', 'G', 'C', 'N']
#Matches the motif pattern against itself.
def computePrefix(self):
#Initialize prefix array
self.fillPrefixList()
k = 0
for pos in range(1, len(self.motif)):
#Check valid nt
if(self.motif[pos] not in self.validBases):
self.invalidMotif()
#Unique base in motif
while(k > 0 and self.motif[k] != self.motif[pos]):
k = self.prefix[k]
#repeat in motif
if(self.motif[k] == self.motif[pos]):
k += 1
self.prefix[pos] = k
#Initialize the prefix list and set first element to 0
def fillPrefixList(self):
self.prefix = [None] * len(self.motif)
self.prefix[0] = 0
#An implementation of the Knuth-Morris-Pratt algorithm for linear time string matching
def kmpSearch(self):
#Compute prefix array
self.computePrefix()
#Number of characters matched
match = 0
found = False
for pos in range(0, len(self.seq)):
#Check valid nt
if(self.seq[pos] not in self.validBases):
self.invalidSequence()
#Next character is not a match
while(match > 0 and self.motif[match] != self.seq[pos]):
match = self.prefix[match-1]
#A character match has been found
if(self.motif[match] == self.seq[pos]):
match += 1
#Motif found
if(match == len(self.motif)):
print(self.header)
print("Match found at position: " + str(pos-match+2) + ':' + str(pos+1))
found = True
match = self.prefix[match-1]
if(found == False):
print("Sorry '" + self.motif + "'" + " was not found in " + str(self.header))
#An invalid character in the motif message to the user
def invalidMotif(self):
print("Error: motif contains invalid DNA nucleotides")
exit()
#An invalid character in the sequence message to the user
def invalidSequence(self):
print("Error: " + str(self.header) + "sequence contains invalid DNA nucleotides")
exit()
答案 1 :(得分:2)
您可能想要试用我的代码:
def recursive_find_match(i, j, pattern, pattern_track):
if pattern[i] == pattern[j]:
pattern_track.append(i+1)
return {"append":pattern_track, "i": i+1, "j": j+1}
elif pattern[i] != pattern[j] and i == 0:
pattern_track.append(i)
return {"append":pattern_track, "i": i, "j": j+1}
else:
i = pattern_track[i-1]
return recursive_find_match(i, j, pattern, pattern_track)
def kmp(str_, pattern):
len_str = len(str_)
len_pattern = len(pattern)
pattern_track = []
if len_pattern == 0:
return
elif len_pattern == 1:
pattern_track = [0]
else:
pattern_track = [0]
i = 0
j = 1
while j < len_pattern:
data = recursive_find_match(i, j, pattern, pattern_track)
i = data["i"]
j = data["j"]
pattern_track = data["append"]
index_str = 0
index_pattern = 0
match_from = -1
while index_str < len_str:
if index_pattern == len_pattern:
break
if str_[index_str] == pattern[index_pattern]:
if index_pattern == 0:
match_from = index_str
index_pattern += 1
index_str += 1
else:
if index_pattern == 0:
index_str += 1
else:
index_pattern = pattern_track[index_pattern-1]
match_from = index_str - index_pattern
答案 2 :(得分:1)
试试这个:
def kmp_matcher(t, d):
n=len(t)
m=len(d)
pi = compute_prefix_function(d)
q = 0
i = 0
while i < n:
if d[q]==t[i]:
q=q+1
i = i + 1
else:
if q != 0:
q = pi[q-1]
else:
i = i + 1
if q == m:
print "pattern occurs with shift "+str(i-q)
q = pi[q-1]
def compute_prefix_function(p):
m=len(p)
pi =range(m)
k=1
l = 0
while k < m:
if p[k] <= p[l]:
l = l + 1
pi[k] = l
k = k + 1
else:
if l != 0:
l = pi[l-1]
else:
pi[k] = 0
k = k + 1
return pi
t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t, p)
答案 3 :(得分:1)
KMP 代表 Knuth-Morris-Pratt,它是一种线性时间字符串匹配算法。
注意在python中,字符串是零基,(而在书中,字符串以索引1开头)。 >
所以我们可以通过在两个字符串的开头插入一个空格来解决这个问题。
这导致了四个事实:
len
都增加了 1,因此在 循环范围中,我们不必将 +1
插入正确的间隔。 (注意在python中最后一步被排除在外);k+1
和 q+1
的值作为数组索引之前检查它们;m
的长度增加了1,在kmp_matcher
中,在打印响应之前,您必须检查this:q==m-1
; i-(m-1)
因此,基于您的原始问题,并考虑到您要求的 Cormen 的起始代码,正确的代码如下:
(注意:我在里面插入了一个匹配的模式,以及一些帮助我找到逻辑错误的调试文本):
def compute_prefix_function(P):
m = len(P)
pi = [None] * m
pi[1] = 0
k = 0
for q in range(2, m):
print ("q=", q, "\n")
print ("k=", k, "\n")
if ((k+1) < m):
while (k > 0 and P[k+1] != P[q]):
print ("entered while: \n")
print ("k: ", k, "\tP[k+1]: ", P[k+1], "\tq: ", q, "\tP[q]: ", P[q])
k = pi[k]
if P[k+1] == P[q]:
k = k+1
print ("Entered if: \n")
print ("k: ", k, "\tP[k]: ", P[k], "\tq: ", q, "\tP[q]: ", P[q])
pi[q] = k
print ("Outside while or if: \n")
print ("pi[", q, "] = ", k, "\n")
print ("---next---")
print ("---end for---")
return pi
def kmp_matcher(T, P):
n = len(T)
m = len(P)
pi = compute_prefix_function(P)
q = 0
for i in range(1, n):
print ("i=", i, "\n")
print ("q=", q, "\n")
print ("m=", m, "\n")
if ((q+1) < m):
while (q > 0 and P[q+1] != T[i]):
q = pi[q]
if P[q+1] == T[i]:
q = q+1
if q == m-1:
print ("Pattern occurs with shift", i-(m-1))
q = pi[q]
print("---next---")
print("---end for---")
txt = " bacbababaabcbab"
ptn = " ababaab"
kmp_matcher(txt, ptn)
(所以这将是正确的接受答案...)
希望能帮到你。