检查字符串中是否含有非法字符的最有效方法

时间:2013-09-17 08:34:09

标签: python regex string performance illegal-characters

如果非法字符集由许多不同的范围和若干单独的码位组成,那么检查一个字符串中是否含有该集合内字符的最有效方法是什么?

我对两种方法进行了计时,其中一种比另一种慢得多(见下面的代码——假设我的计时方式没有问题)。下面的搜索模式方法还能进一步改进吗?解决方案不限于使用正则表达式。

import re
import timeit

# Whitelist-style pattern: a negated character class repeated with `*` and
# anchored with `$`, so a match from index 0 means every character scanned
# lies outside the forbidden set.  Each fragment below is one slice of that
# forbidden set; the fragments are joined into a single class.
matchPat = re.compile(
    '[^' + ''.join((
        r'\u0000-\u0008',  # C0 controls U+0000..U+0008
        r'\u000B\u000C',   # VT and FF; TAB U+0009, LF U+000A, CR U+000D are not listed, so they stay legal
        r'\u000E-\u001F',  # remaining C0 controls
        r'\u007F',         # DEL
        r'\u0080-\u009F',  # the whole C1 block
        r'\u2028\u2029',   # LS and PS (Unicode newlines)
        r'\uD800-\uDFFF',  # surrogate block
        r'\uFFFE\uFFFF',   # non-characters
        # BOM U+FEFF is rejected at every index by this class.
        # NOTE(review): the original note said a BOM is allowed at the start
        # of the stream -- that allowance is not implemented here; confirm.
        r'\uFEFF',
    )) + ']*$'
)

# Blacklist-style pattern: a single character class listing every forbidden
# code point, meant to be used with .search() so the scan can stop at the
# first offending character.  Each fragment below is one slice of the set.
searchPat = re.compile(
    '[' + ''.join((
        r'\u0000-\u0008',  # C0 controls U+0000..U+0008
        r'\u000B\u000C',   # VT and FF; TAB U+0009, LF U+000A, CR U+000D are not listed, so they stay legal
        r'\u000E-\u001F',  # remaining C0 controls
        r'\u007F',         # DEL
        r'\u0080-\u009F',  # the whole C1 block
        r'\u2028\u2029',   # LS and PS (Unicode newlines)
        r'\uD800-\uDFFF',  # surrogate block
        r'\uFFFE\uFFFF',   # non-characters
        # BOM U+FEFF is flagged at every index by this class.
        # NOTE(review): the original note said a BOM is allowed at the start
        # of the stream -- that allowance is not implemented here; confirm.
        r'\uFEFF',
    )) + ']'
)

# Sample input for the benchmark: plain ASCII letters, digits, spaces, commas
# and hyphens only, i.e. nothing from the forbidden character set.
s = (
    'allow TAB 0009, LF 000A, and CR 000D'
    ' -- only allowed at the start of the stream'
)

def fmatch(s):
    """Check ``s`` against the module-level ``matchPat`` pattern.

    The original stored the outcome in an unused local and always returned
    None; the outcome is now returned so the check is actually usable.
    Callers that ignore the return value (such as a timeit statement) are
    unaffected.

    Args:
        s: The string to validate.

    Returns:
        True if ``matchPat.match(s)`` succeeds, False otherwise.
    """
    return matchPat.match(s) is not None

def fsearch(s):
    """Check ``s`` against the module-level ``searchPat`` pattern.

    The original stored the outcome in an unused local and always returned
    None; the outcome is now returned so the check is actually usable.
    Callers that ignore the return value (such as a timeit statement) are
    unaffected.

    Args:
        s: The string to validate.

    Returns:
        True if ``searchPat.search(s)`` finds nothing (the string is valid),
        False if it finds a match.
    """
    return searchPat.search(s) is None

# Time one million calls of each checker and print the total seconds.
# The setup string imports the function and the sample text from the running
# script, so this only works when the file is executed as __main__.
print(
    'fmatch==',
    timeit.Timer(
        "fmatch(s)",
        setup="from __main__ import fmatch,s",
    ).timeit(number=1000000),
)
print(
    'fsearch==',
    timeit.Timer(
        "fsearch(s)",
        setup="from __main__ import fsearch,s",
    ).timeit(number=1000000),
)


$ python3 valid.py
fmatch== 5.631323281995719
fsearch== 1.320517893997021

0 个答案:

没有答案