Python Regex匹配错误的字符串

时间:2018-01-17 00:03:32

标签: python regex match

我有以下python脚本使用' AND',' OR'进行正则表达式匹配。功能:

class PyBoolReException(Exception):

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str(self.value)


class PyBoolRe:

    def __init__(self, boolstr):
        # Require whitespace  before words?
        self.__needspace = True
        # whitespace re
        self._wspre = re.compile('^\s*$')
        # create regexp string
        self.__rexplist = []
        oparct = boolstr.count('(')
        clparct = boolstr.count(')')
        if oparct != clparct:
            raise PyBoolReException, 'Mismatched parantheses!'

        self.__parse(boolstr)
        # if NOT is one of the members, reverse
        # the list
        # print self.__rexplist
        if '!' in self.__rexplist:
            self.__rexplist.reverse()

        s = self.__makerexp(self.__rexplist)
        # print s
        self.__rexp = re.compile(s)

    def match(self, data):
        """ Match the boolean expression, behaviour
        is same as the 'match' method of re """

        return self.__rexp.match(data)

    def search(self, data):
        """ Search the boolean expression, behaviour
        is same as the 'search' method of re """

        return self.__rexp.search(data)

    def __parse(self, s):
        """ Parse the boolean regular expression string
        and create the regexp list """

        # The string is a nested parantheses with
        # any character in between the parens.

        scopy = s[:]
        oparmatch, clparmatch = False, False

        # Look for a NOT expression
        index = scopy.rfind('(')

        l = []
        if index != -1:
            oparmatch = True
            index2 = scopy.find(')', index)
            if index2 != -1:
                clparmatch = True
                newstr = scopy[index+1:index2]
                # if the string is only of whitespace chars, skip it
                if not self._wspre.match(newstr):
                    self.__rexplist.append(newstr)
                replacestr = '(' + newstr + ')'
                scopy = scopy.replace(replacestr, '')

                self.__parse(scopy)

        if not clparmatch and not oparmatch:
            if scopy: self.__rexplist.append(scopy)

    def is_inbetween(self, l, elem):
        """ Find out if an element is in between
        in a list """

        index = l.index(elem)
        if index == 0:
            return False

        if index>2:
            if index in range(1, len(l) -1):
                return True
            else:
                return False
        else:
            return True

    def __makenotexpr(self, s):
        """ Make a NOT expression """

        if s.find('!') == 0:
            return ''.join(('(?!', s[1:], ')'))
        else:
            return s

    def __makerexp(self, rexplist):
        """ Make the regular expression string for
        the boolean match from the nested list """


        is_list = True

        if type(rexplist) is str:
            is_list = False
            elem = rexplist
        elif type(rexplist) is list:
            elem = rexplist[0]

        if type(elem) is list:
            elem = elem[0]

        eor = False
        if not is_list or len(rexplist) == 1:
            eor = True

        word_str = '.*'

        s=''
        # Implementing NOT
        if elem == '!':
            return ''.join(('(?!', self.__makerexp(rexplist[1:]), ')'))
        # Implementing OR
        elif elem.find(' | ') != -1:
            listofors = elem.split(' | ')

            for o in listofors:
                index = listofors.index(o)
                in_bet = self.is_inbetween(listofors, o)

                if o:
                    o = self.__makenotexpr(o)
                    if in_bet:
                        s = ''.join((s, '|', word_str, o, '.*'))
                    else:
                        s = ''.join((s, word_str, o, '.*'))

        # Implementing AND
        elif elem.find(' & ') != -1:
            listofands = elem.split(' & ')

            for a in listofands:
                index = listofands.index(a)
                in_bet = self.is_inbetween(listofands, a)                

                if a:
                    a = self.__makenotexpr(a)                   
                    s = ''.join((s, word_str, a, '.*'))

        else:
            if elem:
                elem = self.__makenotexpr(elem)             
                s = ''.join((elem, '.*'))

        if eor:
            return s
        else:
            return ''.join((s, self.__makerexp(rexplist[1:])))

当搜索短语如下:

p = PyBoolRe('Python | Perl')

s1 = 'Guido invented Python'
s2 = 'Guido Perl'

if p.match(s1):
   print 'Match found for first string'
else:
   print 'No match found for first string'

if p.match(s2):
   print 'Match found for second string'
else:
   print 'No match found for second string'

然后s1和& s2匹配

但是当搜索短语是:

p = PyBoolRe('Guido & (Python | Perl)')

s1 = 'Guido invented Python'
s2 = 'Guido Perl is great'

如果s1或s2有"Guido Python""Guido Perl",那么它应匹配。 s2有,但它不匹配。另一方面,它匹配s1,它不应该。那是为什么?

请帮忙!!我怎样才能让它发挥作用?

1 个答案:

答案 0 :(得分:0)

您生成的表达式是

.*Python.*|.*Perl.*.*Guido.*

虽然看起来像

(?=.*Guido.*)(?:.*Python.*|.*Perl.*)

所以解析器需要一些修改。

1)x|y应该包含在(?:...)中(至少在另一个块中使用时)。否则,|不幸地在正则表达式中获取全局优先级。

2)x & y应转换为(?=x)y(尾随上下文可用于表达正则表达式之间的and