我有以下python脚本使用' AND',' OR'进行正则表达式匹配。功能:
class PyBoolReException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
class PyBoolRe:
def __init__(self, boolstr):
# Require whitespace before words?
self.__needspace = True
# whitespace re
self._wspre = re.compile('^\s*$')
# create regexp string
self.__rexplist = []
oparct = boolstr.count('(')
clparct = boolstr.count(')')
if oparct != clparct:
raise PyBoolReException, 'Mismatched parantheses!'
self.__parse(boolstr)
# if NOT is one of the members, reverse
# the list
# print self.__rexplist
if '!' in self.__rexplist:
self.__rexplist.reverse()
s = self.__makerexp(self.__rexplist)
# print s
self.__rexp = re.compile(s)
def match(self, data):
""" Match the boolean expression, behaviour
is same as the 'match' method of re """
return self.__rexp.match(data)
def search(self, data):
""" Search the boolean expression, behaviour
is same as the 'search' method of re """
return self.__rexp.search(data)
def __parse(self, s):
""" Parse the boolean regular expression string
and create the regexp list """
# The string is a nested parantheses with
# any character in between the parens.
scopy = s[:]
oparmatch, clparmatch = False, False
# Look for a NOT expression
index = scopy.rfind('(')
l = []
if index != -1:
oparmatch = True
index2 = scopy.find(')', index)
if index2 != -1:
clparmatch = True
newstr = scopy[index+1:index2]
# if the string is only of whitespace chars, skip it
if not self._wspre.match(newstr):
self.__rexplist.append(newstr)
replacestr = '(' + newstr + ')'
scopy = scopy.replace(replacestr, '')
self.__parse(scopy)
if not clparmatch and not oparmatch:
if scopy: self.__rexplist.append(scopy)
def is_inbetween(self, l, elem):
""" Find out if an element is in between
in a list """
index = l.index(elem)
if index == 0:
return False
if index>2:
if index in range(1, len(l) -1):
return True
else:
return False
else:
return True
def __makenotexpr(self, s):
""" Make a NOT expression """
if s.find('!') == 0:
return ''.join(('(?!', s[1:], ')'))
else:
return s
def __makerexp(self, rexplist):
""" Make the regular expression string for
the boolean match from the nested list """
is_list = True
if type(rexplist) is str:
is_list = False
elem = rexplist
elif type(rexplist) is list:
elem = rexplist[0]
if type(elem) is list:
elem = elem[0]
eor = False
if not is_list or len(rexplist) == 1:
eor = True
word_str = '.*'
s=''
# Implementing NOT
if elem == '!':
return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
# Implementing OR
elif elem.find(' | ') != -1:
listofors = elem.split(' | ')
for o in listofors:
index = listofors.index(o)
in_bet = self.is_inbetween(listofors, o)
if o:
o = self.__makenotexpr(o)
if in_bet:
s = ''.join((s, '|', word_str, o, '.*'))
else:
s = ''.join((s, word_str, o, '.*'))
# Implementing AND
elif elem.find(' & ') != -1:
listofands = elem.split(' & ')
for a in listofands:
index = listofands.index(a)
in_bet = self.is_inbetween(listofands, a)
if a:
a = self.__makenotexpr(a)
s = ''.join((s, word_str, a, '.*'))
else:
if elem:
elem = self.__makenotexpr(elem)
s = ''.join((elem, '.*'))
if eor:
return s
else:
return ''.join((s, self.__makerexp(rexplist[1:])))
当搜索短语如下:
p = PyBoolRe('Python | Perl')
s1 = 'Guido invented Python'
s2 = 'Guido Perl'
if p.match(s1):
print 'Match found for first string'
else:
print 'No match found for first string'
if p.match(s2):
print 'Match found for second string'
else:
print 'No match found for second string'
然后s1
和& s2
匹配
但是当搜索短语是:
p = PyBoolRe('Guido & (Python | Perl)')
s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'
如果s1或s2有"Guido Python"
或"Guido Perl"
,那么它应匹配。 s2有,但它不匹配。另一方面,它匹配s1,它不应该。那是为什么?
请帮忙!!我怎样才能让它发挥作用?
答案 0 :(得分:0)
您生成的表达式是
.*Python.*|.*Perl.*.*Guido.*
虽然看起来像
(?=.*Guido.*)(?:.*Python.*|.*Perl.*)
所以解析器需要一些修改。
1)x|y
应该包含在(?:...)
中(至少在另一个块中使用时)。否则,|
不幸地在正则表达式中获取全局优先级。
2)x & y
应转换为(?=x)y
(尾随上下文可用于表达正则表达式之间的and
)