def format_title(title):
    """Return ``title`` stripped, with every non-cased character replaced by '_'.

    Leading and trailing whitespace is removed first. Characters for which
    either ``isupper()`` or ``islower()`` is true are kept; everything else
    (digits, punctuation, inner whitespace, uncased characters) becomes '_'.
    """
    # The original expression built the string but never returned it, so
    # every call evaluated to None.
    return ''.join(
        ch if (ch.isupper() or ch.islower()) else '_'
        for ch in title.strip()
    )
还有什么更快的吗?
答案 0 :(得分:20)
更快捷的方法是使用str.translate()
这比你的方式快〜50倍
# You only need to do this once
>>> title_trans=''.join(chr(c) if chr(c).isupper() or chr(c).islower() else '_' for c in range(256))
>>> "abcde1234!@%^".translate(title_trans)
'abcde________'
# Using map+lambda
$ python -m timeit '"".join(map(lambda x: x if (x.isupper() or x.islower()) else "_", "abcd1234!@#$".strip()))'
10000 loops, best of 3: 21.9 usec per loop
# Using str.translate
$ python -m timeit -s 'titletrans="".join(chr(c) if chr(c).isupper() or chr(c).islower() else "_" for c in range(256))' '"abcd1234!@#$".translate(titletrans)'
1000000 loops, best of 3: 0.422 usec per loop
# Here is regex for a comparison
$ python -m timeit -s 'import re;transre=re.compile("[\W\d]+")' 'transre.sub("_","abcd1234!@#$")'
100000 loops, best of 3: 3.17 usec per loop
这是unicode的版本
# coding: UTF-8
def format_title_unicode_translate(title):
    """Return ``title`` translated through the module-level ``title_unicode_trans`` table."""
    translation_table = title_unicode_trans
    return title.translate(translation_table)
class TitleUnicodeTranslate(dict):
    """Lazily filled translation table keyed by integer code point.

    Looking up a code point that is not stored yet triggers ``__missing__``,
    which computes the replacement, stores it in the dict and returns it, so
    each code point is computed at most once per table instance.
    """

    def __missing__(self, item):
        """Compute, cache and return the replacement for code point ``item``.

        The character itself is returned when it is upper- or lower-case;
        any other character maps to ``u"_"``.
        """
        # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the
        # same job, so fall back to it instead of raising NameError.
        try:
            to_char = unichr
        except NameError:
            to_char = chr
        uni = to_char(item)
        res = uni if (uni.isupper() or uni.islower()) else u"_"
        # Store the answer so later lookups bypass __missing__ entirely.
        self[item] = res
        return res
# Single shared table instance; it fills itself in as code points are seen.
title_unicode_trans = TitleUnicodeTranslate()
# Demo call with a mixed Latin/Greek title.
print(format_title_unicode_translate(u"Metallica Μεταλλικα"))
请注意,希腊字母计为大写和小写,因此不会替换它们。 如果要替换它们,只需将条件更改为
if item<256 and (uni.isupper() or uni.islower()):
答案 1 :(得分:17)
import re
# Raw string: \W and \d are regex escapes, not string escapes.
# Each non-word character or digit is replaced by its own underscore.
title = re.sub(r"[\W\d]", "_", title.strip())
应该更快。
如果您想用一个下划线替换一系列相邻的非字母,请使用
# Raw string: \W and \d are regex escapes, not string escapes.
# The "+" collapses each run of adjacent matches into a single underscore.
title = re.sub(r"[\W\d]+", "_", title.strip())
这种写法甚至还会更快一些。
我刚刚进行了时间比较:
C:\>python -m timeit -n 100 -s "data=open('test.txt').read().strip()" "''.join(map(lambda x: x if (x.isupper() or x.islower()) else '_', data))"
100 loops, best of 3: 4.51 msec per loop
C:\>python -m timeit -n 100 -s "import re; regex=re.compile('[\W\d]+'); data=open('test.txt').read().strip()" "title=regex.sub('_',data)"
100 loops, best of 3: 2.35 msec per loop
这也适用于Unicode字符串(在Python 3下,\W 匹配任何不是Unicode单词字符的字符;在Python 2下,您还必须为此额外设置 UNICODE 标志)。
答案 2 :(得分:2)
您应该可以用 x.isalpha() 来代替 (x.isupper() or x.islower())。
isalpha() 方法可能会对 '_' 返回 True(我不记得是否如此),但那样您最终只是用 '_' 替换 '_',
所以没有什么坏处。(感谢KennyTM指出这一点。)
答案 3 :(得分:1)
出于我自己的原因,我对这个问题很好奇,于是写了一个简单的脚本来测试此处列出的各种方法,另外还测试了仅仅去掉lambda的做法——我原以为这会让原始方案变快(结果证明我错了)。
简而言之,str.translate方法把其他方法远远甩在了后面。顺便提一下,正则表达式解决方案虽然紧随其后位居第二,但按上文的写法(带"+"的版本)得到的结果是不正确的。
这是我的测试程序:
import re
from time import time
def format_title(title):
    """Baseline variant: ``map`` a lambda over the stripped title.

    Upper- and lower-case characters are kept; every other character is
    replaced by "_".
    """
    stripped = title.strip()
    return ''.join(
        map(lambda ch: ch if (ch.isupper() or ch.islower()) else "_", stripped))
def format_title_list_comp(title):
    """Variant: list comprehension with the ``isupper() or islower()`` test."""
    stripped = title.strip()
    return ''.join(
        [ch if ch.isupper() or ch.islower() else "_" for ch in stripped])
def format_title_list_comp_is_alpha(title):
    """Variant: list comprehension using ``isalpha()`` as the keep test."""
    stripped = title.strip()
    return ''.join([ch if ch.isalpha() else "_" for ch in stripped])
def format_title_is_alpha(title):
    """Variant: ``map`` a lambda that keeps characters passing ``isalpha()``."""
    stripped = title.strip()
    return ''.join(map(lambda ch: ch if ch.isalpha() else '_', stripped))
def format_title_no_lambda(title):
    """Variant: ``map`` a nested named function instead of a lambda.

    The helper keeps upper- and lower-case characters and turns anything
    else into "_".
    """
    def keep_cased(ch):
        return ch if (ch.isupper() or ch.islower()) else "_"

    return ''.join(map(keep_cased, title.strip()))
def format_title_no_lambda_is_alpha(title):
    """Variant: ``map`` a nested named function that tests ``isalpha()``."""
    def keep_alpha(ch):
        return ch if ch.isalpha() else "_"

    return ''.join(map(keep_alpha, title.strip()))
def format_title_re(title):
    """Variant: regex substitution that collapses runs into one "_".

    Every run of adjacent non-word characters and digits in the stripped
    title is replaced by a single underscore, so the output can be shorter
    than what the per-character variants produce. This difference is kept
    on purpose: the benchmark reports it as a mismatch against the baseline.
    """
    # Raw string so \W and \d reach the regex engine as written instead of
    # relying on Python passing unknown string escapes through.
    return re.sub(r"[\W\d]+", "_", title.strip())
def format_title_re_corrected(title):
    """Variant: regex substitution replacing one character at a time.

    Each non-word character or digit in the stripped title becomes its own
    underscore, so the output length matches the per-character variants.
    """
    # Raw string so \W and \d reach the regex engine as written instead of
    # relying on Python passing unknown string escapes through.
    return re.sub(r"[\W\d]", "_", title.strip())
# 256-entry lookup table indexed by character code (0-255): characters
# passing isalpha() map to themselves, every other code maps to "_".
# Built once at import time so the per-call work is a single translate().
TITLE_TRANS = ''.join(chr(c) if chr(c).isalpha() else '_' for c in range(256))


def format_title_with_translate(title):
    """Variant: replace characters through the precomputed ``TITLE_TRANS`` table.

    The title is stripped first, like every other variant in this script;
    the original skipped that step, so padded input produced leading and
    trailing underscores instead of matching the baseline.
    """
    return title.strip().translate(TITLE_TRANS)
# Number of times timetest() calls each candidate function.
ITERATIONS = 200000
# Sample input passed to every candidate; mixes letters, digits, punctuation,
# an underscore and a space.
EXAMPLE_TITLE = "abc123def_$%^!FOO BAR*bazx-bif"
def timetest(f):
    """Call ``f(EXAMPLE_TITLE)`` ``ITERATIONS`` times and time the whole loop.

    Returns a ``(result, elapsed)`` tuple: the value returned by the last
    call and the difference between two ``time()`` readings taken around
    the loop.
    """
    started = time()
    for _ in xrange(ITERATIONS):
        result = f(EXAMPLE_TITLE)
    elapsed = time() - started
    return result, elapsed
# Time the original implementation once; print_result() compares every other
# candidate's output and timing against these two values.
baseline_result, baseline_time = timetest(format_title)
def print_result(f, result, time):
    """Print one report line comparing a candidate against the baseline.

    ``f`` is the candidate function (only its ``__name__`` is used),
    ``result`` is the string it produced and ``time`` is its elapsed time.
    The line shows the elapsed time, the signed difference from
    ``baseline_time`` in seconds and as a percentage, the result itself and
    whether it equals ``baseline_result``.

    Note: the ``time`` parameter shadows the imported ``time`` function
    inside this body.
    """
    verdict = "CORRECT" if result == baseline_result else "INCORRECT"
    delta = time - baseline_time
    # Negative numbers already print their own "-", so only add a sign for
    # zero or positive differences.
    sign = "" if delta < 0 else "+"
    percent = (delta / baseline_time) * 100
    print("%s: %0.3fs %s%0.3fs [%s%0.4f%%] (%s - %s)" % (
        f.__name__, time, sign, delta, sign, percent, result, verdict))
# Report the baseline first, then time and report each alternative in turn.
print_result(format_title, baseline_result, baseline_time)
print("----")
for f in (
    format_title_is_alpha,
    format_title_list_comp,
    format_title_list_comp_is_alpha,
    format_title_no_lambda,
    format_title_no_lambda_is_alpha,
    format_title_re,
    format_title_re_corrected,
    format_title_with_translate,
):
    alt_result, alt_time = timetest(f)
    print_result(f, alt_result, alt_time)
以下是结果:
format_title: 3.121s +0.000s [+0.0000%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
----
format_title_is_alpha: 2.336s -0.785s [-25.1470%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_list_comp: 2.369s -0.751s [-24.0773%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_list_comp_is_alpha: 1.735s -1.386s [-44.4021%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_no_lambda: 2.992s -0.129s [-4.1336%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_no_lambda_is_alpha: 2.377s -0.744s [-23.8314%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_re: 1.290s -1.831s [-58.6628%] (abc_def__FOO_BAR_bazx_bif - INCORRECT)
format_title_re_corrected: 1.338s -1.782s [-57.1165%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
format_title_with_translate: 0.098s -3.022s [-96.8447%] (abc___def_____FOO_BAR_bazx_bif - CORRECT)
答案 4 :(得分:0)
import string,sys
# string.letters only exists on Python 2; fall back to string.ascii_letters
# so the snippet also runs where that attribute is missing.
letters = getattr(string, "letters", string.ascii_letters)
# Build the character list in one pass: keep letters, turn everything else
# into "_" (replaces the index-based in-place loop and its "not c in" test).
mystring = [c if c in letters else "_" for c in "abc134#$@e##$%%$*&(()#def"]
print(''.join(mystring))