如果非法字符集由许多不连续的区间和若干单独的码位组成,那么检查一个字符串是否包含其中任何字符的最高效方法是什么?
我对两种方法进行了计时,其中一种比另一种慢得多(见下面的代码——假设我的计时方式没有问题)。下面的搜索模式方法还能进一步改进吗?解决方案不限于使用正则表达式。
import re
import timeit
# Match pattern: a negated character class repeated to the end of the string.
# With match() it succeeds only when every character lies outside the illegal
# set. The \uXXXX sequences sit in raw strings, so they reach the re module
# as regex escapes rather than being decoded by the Python parser.
matchPat = re.compile(
    ''.join((
        r'[^',
        r'\u0000-\u0008',  # C0 controls below TAB
        r'\u000B\u000C',   # VT and FF; TAB U+0009, LF U+000A, CR U+000D stay legal
        r'\u000E-\u001F',  # remaining C0 controls
        r'\u007F',         # DEL
        r'\u0080-\u009F',  # the whole C1 block
        r'\u2028\u2029',   # LS and PS Unicode newlines
        r'\uD800-\uDFFF',  # surrogate block
        r'\uFFFE\uFFFF',   # non-characters
        r'\uFEFF',         # BOM (this class rejects it at any position)
        r']*$',
    ))
)
# Search pattern: a positive character class holding every illegal code point.
# With search() a hit means the string contains at least one illegal character.
# The \uXXXX sequences sit in raw strings, so they reach the re module as
# regex escapes rather than being decoded by the Python parser.
searchPat = re.compile(
    ''.join((
        r'[',
        r'\u0000-\u0008',  # C0 controls below TAB
        r'\u000B\u000C',   # VT and FF; TAB U+0009, LF U+000A, CR U+000D stay legal
        r'\u000E-\u001F',  # remaining C0 controls
        r'\u007F',         # DEL
        r'\u0080-\u009F',  # the whole C1 block
        r'\u2028\u2029',   # LS and PS Unicode newlines
        r'\uD800-\uDFFF',  # surrogate block
        r'\uFFFE\uFFFF',   # non-characters
        # BOM is flagged at any position, including the first character.
        # NOTE(review): if a leading BOM should be tolerated, the caller has
        # to strip it before checking -- confirm the intended behavior.
        r'\uFEFF',
        r']',
    ))
)
s = 'allow TAB 0009, LF 000A, and CR 000D -- only allowed at the start of the stream' # sample legal string
def fmatch(s):
    """Check *s* for illegal characters with the whole-string match pattern.

    The original stored the outcome in an unused local and always returned
    None, so the result could not be observed; it is now returned.

    Args:
        s: The string to validate.

    Returns:
        True when the module-level ``matchPat`` matches *s*, i.e. no illegal
        character was found; False otherwise.
    """
    # match() yields a match object on success and None on failure.
    return matchPat.match(s) is not None
def fsearch(s):
    """Check *s* for illegal characters by searching for the first offender.

    The original stored the outcome in an unused local and always returned
    None, so the result could not be observed; it is now returned.

    Args:
        s: The string to validate.

    Returns:
        True when the module-level ``searchPat`` finds nothing in *s*, i.e.
        the string is valid; False as soon as one illegal character is found.
    """
    # search() yields None when no character of the illegal class occurs.
    return searchPat.search(s) is None
# Time one million calls of each checker and print "<name>== <seconds>".
# The setup statement imports the checker and the sample string from
# __main__, so both names must be defined in the module run as the script.
for label in ('fmatch', 'fsearch'):
    elapsed = timeit.timeit(
        label + "(s)",
        setup="from __main__ import " + label + ",s",
        number=1000000,
    )
    print(label + '==', elapsed)
$ python3 valid.py
fmatch== 5.631323281995719
fsearch== 1.320517893997021