我想在一个较长的字符串中找到某个子字符串第一次出现的索引。我只希望匹配完整的单词,并且不区分大小写,但要把 CamelCase 中的各个部分视为独立的单词。
下面的代码可以解决问题,但速度很慢,我想让它更快。有什么建议吗?我尝试过一些正则表达式的写法,但找不到能处理所有边缘情况的方案。
def word_start_index(text, seek_word):
    """Return the index of the first whole-word match of seek_word in text.

    Words are maximal runs of alphanumeric characters. A run is additionally
    split wherever a lowercase character is immediately followed by an
    uppercase one, so "FooBar" is seen as "Foo" and "Bar". The comparison is
    case-insensitive.

    Args:
        text: The string to scan.
        seek_word: The word to look for.

    Returns:
        The start index of the first matching word, or None if no word
        matches.
    """
    # Loop-invariant: lowercase the target once instead of on every compare.
    target = seek_word.lower()

    # Start index of the word currently being scanned; None between words.
    # It begins at 0 so that the "current word" is the empty slice text[0:0]
    # before the first character has been consumed.
    start = 0

    for i, ch in enumerate(text):
        is_word_char = ch.isalnum()
        in_word = start is not None and start < i
        camel_boundary = in_word and ch.isupper() and text[i - 1].islower()

        if camel_boundary or not is_word_char:
            # The current word ends just before index i. Slice it out here
            # rather than accumulating it one character at a time.
            word = text[start:i] if start is not None else ""
            if word.lower() == target:
                return start
            start = None

        if is_word_char and start is None:
            start = i

    # The final word is not followed by a boundary character; check it here.
    tail = text[start:] if start is not None else ""
    if tail.lower() == target:
        return start
    return None
if __name__ == "__main__":
    # Column ruler: the last digit of each character index in test_text.
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]
    for word in test_words:
        match_start = word_start_index(test_text, word)
        # Printing one pre-formatted string gives "<index> <word>" on both
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("%s %s" % (match_start, word))
输出:
0 a
9 foo
12 bar
16 baz
20 golf
25 cart
None fred
答案 0 :(得分:3)
word_emitter(见下文)接受一个文本字符串,并在找到“单词”时逐个产生它们的小写形式(连同它们的位置),一次一个。
它先用空格替换所有下划线(以及其他标点符号),然后将文本拆分为一个列表。例如,
"a_foobar_FooBar baz golf_CART Foo"
变为
['a', 'foobar', 'FooBar', 'baz', 'golf', 'CART', 'Foo']
当然,您还希望将camelCase单词视为单独的单词。
因此,对于上面列表中的每个部分,我们使用正则表达式模式 '(.*[a-z])(?=[A-Z])' 来拆分 camelCase 单词。此正则表达式使用了 re 模块的前瞻(look-ahead)操作符 (?=...)。(注意:下面给出的 camel_splitter 实际上是用 itertools.groupby 而非该正则表达式来完成拆分的。)
也许这是整个事情中最棘手的部分。
word_emitter 一次生成一个单词及其对应的位置。
一旦你有一个将文本分成“单词”的功能,剩下的就很容易了。
我也会切换循环的顺序,所以你只需循环遍历test_text一次。如果test_text与test_words相比很长,这将加快速度。
import re
import string
import itertools
# Matches a maximal run of non-whitespace characters.
nonspace = re.compile(r'(\S+)')

# Characters that are turned into spaces before the text is split into
# pieces: the underscore plus common punctuation, including the single quote.
_SEPARATORS = '_.,!?;:"(){}@#$%^&*-+=' + "'"

# string.maketrans exists only on Python 2 and str.maketrans only on
# Python 3; pick whichever is available.
_maketrans = getattr(str, 'maketrans', None) or string.maketrans

# maketrans requires both arguments to have the same length, so supply
# exactly one space per separator character.
table = _maketrans(_SEPARATORS, ' ' * len(_SEPARATORS))
def piece_emitter(text):
    """Yield ``(position, piece)`` for each whitespace-delimited piece of text.

    The module-level ``table`` is applied to the text first, so every
    character it maps to a space acts as a separator. Given
    "a_foobar_FooBar" this generator yields::

        (0, 'a')
        (2, 'foobar')
        (9, 'FooBar')

    Args:
        text: The string to split.

    Yields:
        2-tuples of the piece's start offset in ``text`` and the piece itself.
    """
    # NOTE: positions are only correct if ``table`` maps each character to
    # exactly one character, so the translated text keeps the same length.
    cleaned = text.translate(table)

    # One pass is enough: translation works character by character, so
    # grouping the whole translated text by "is whitespace" finds the same
    # pieces at the same offsets as grouping each chunk separately would.
    pos = 0
    for is_space, run in itertools.groupby(cleaned, lambda ch: ch.isspace()):
        piece = ''.join(run)
        if not is_space:
            yield pos, piece
        pos += len(piece)
def camel_splitter(word):
    """Yield the camelCase components of word, in order.

    The word is cut into alternating runs of uppercase and non-uppercase
    characters. A run consisting of a single uppercase letter is treated as
    the capital that starts the following run, so 'FooBar' yields 'Foo' then
    'Bar', while 'getFOObar' yields 'get', 'FOO', 'bar'.

    Args:
        word: The string to split.

    Yields:
        The components of ``word``; joined together they equal ``word``.
    """
    runs = itertools.groupby(word, lambda ch: ch.isupper())
    for is_upper, run in runs:
        part = ''.join(run)
        # Only a lone *uppercase* letter is glued to what follows. Merging
        # every one-character run would turn 'aFoo' into 'aF', 'oo'.
        if is_upper and len(part) == 1:
            # The default covers a capital that is the last character.
            _, following = next(runs, (None, ''))
            part += ''.join(following)
        yield part
def word_emitter(piece):
    """Yield ``(offset, fragment)`` for every run of adjacent camelCase parts.

    ``piece`` is split with ``camel_splitter``; every contiguous sequence of
    the resulting parts is joined and yielded together with the offset at
    which it starts inside ``piece``. Given 'getFooBar' the generator yields,
    in this order::

        (0, 'get')
        (0, 'getFoo')
        (0, 'getFooBar')
        (3, 'Foo')
        (3, 'FooBar')
        (6, 'Bar')

    Args:
        piece: The string to break into fragments.

    Yields:
        2-tuples of the fragment's start offset and the fragment.
    """
    parts = list(camel_splitter(piece))
    count = len(parts)
    # Combined length of the parts that come before parts[first].
    offset = 0
    for first in range(count):
        for last in range(first + 1, count + 1):
            yield offset, ''.join(parts[first:last])
        offset += len(parts[first])
def camel_search(text, words):
    """Find the first occurrence of each search word in a single pass.

    ``text`` is broken into pieces by ``piece_emitter`` and each piece into
    fragments by ``word_emitter``. A fragment that equals a not-yet-found
    search word (ignoring case) is reported and that search word is marked as
    found. Scanning stops once every search word has been found.

    Args:
        text: The string to search.
        words: An iterable of search words.

    Yields:
        ``(position, fragment)`` for each match, in the order encountered,
        followed by ``(None, word)`` for every search word never matched.
    """
    found = dict.fromkeys(words, False)

    for pos, piece in piece_emitter(text):
        if all(found.values()):
            # Nothing left to look for.
            break
        for subpos, fragment in word_emitter(piece):
            lowered = fragment.lower()
            for target in found:
                if found[target] or lowered != target.lower():
                    continue
                yield pos + subpos, fragment
                found[target] = True
                # A fragment satisfies at most one search word.
                break

    for target, seen in found.items():
        if not seen:
            yield None, target
if __name__ == "__main__":
    # Column ruler: the last digit of each character index in test_text.
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]
    for pos, word in camel_search(test_text, test_words):
        # Printing one pre-formatted string gives "<index> <word>" on both
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("%s %s" % (pos, word.lower()))
以下是我用来检查程序的单元测试:
import unittest
import sys
import camel
import itertools
class Test(unittest.TestCase):
    """Unit tests for the generators and search function in ``camel``."""

    def check(self, result, answer):
        """Assert that result and answer contain equal items in equal order.

        Both arguments may be any iterable (the tests pass generators, lists
        and tuples). Comparing lists reports every difference at once and
        also catches a length mismatch, which pairwise iteration padded with
        None could hide.
        """
        self.assertEqual(list(result), list(answer))

    def test_piece_emitter(self):
        """piece_emitter splits on whitespace and punctuation with offsets."""
        tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
                ((0,'a'),
                 (2,'foobar'),
                 (9,'FooBar'),
                 (16,'baz'),
                 (21,'golf'),
                 (26,'CART'),
                 (31,'Foo'),
                 (36,'food'),
                 (42,'getFooBaz'),
                 )
                ),
               )
        for text,answer in tests:
            result=list(camel.piece_emitter(text))
            print(result)
            self.check(result,answer)

    def test_camel_splitter(self):
        """camel_splitter breaks a word into its camelCase components."""
        tests=(('getFooBar',('get','Foo','Bar')),
               ('getFOObar',('get','FOO','bar')),
               ('Foo',('Foo',)),
               ('getFoo',('get','Foo')),
               ('foobar',('foobar',)),
               ('fooBar',('foo','Bar')),
               ('FooBar',('Foo','Bar')),
               ('a',('a',)),
               ('fooB',('foo','B')),
               ('FooB',('Foo','B')),
               ('FOOb',('FOO','b')),
               )
        for word,answer in tests:
            result=camel.camel_splitter(word)
            self.check(result,answer)

    def test_word_emitter(self):
        """word_emitter yields every run of adjacent components."""
        tests=(("a",
                ((0,'a'),) ),
               ('getFooBar',
                ((0,'get'),
                 (0,'getFoo'),
                 (0,'getFooBar'),
                 (3,'Foo'),
                 (3,'FooBar'),
                 (6,'Bar'),
                 )
                )
               )
        for text,answer in tests:
            result=list(camel.word_emitter(text))
            print(result)
            self.check(result,answer)

    def test_camel_search(self):
        """camel_search reports first matches, then the unmatched words."""
        tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
                ("a", "foo", "bar", "baz", "golf", "cart", "fred", "food",
                 'FooBaz'),
                ((0,'a'),
                 (9,'Foo'),
                 (12,'Bar'),
                 (16,'baz'),
                 (21,'golf'),
                 (26,'CART'),
                 (36,'food'),
                 (45,'FooBaz'),
                 (None,'fred')
                 )
                ),
               ("\"Foo\"",('Foo',),((1,'Foo'),)),
               ("getFooBar",('FooBar',),((3,'FooBar'),)),
               )
        for text,search_words,answer in tests:
            result=list(camel.camel_search(text,search_words))
            print(result)
            self.check(result,answer)
if __name__ == '__main__':
    # Use the sys module this file imports itself, rather than reaching for
    # it through the unittest package as ``unittest.sys``.
    unittest.main(argv=sys.argv + ['--verbose'])
答案 1 :(得分:2)
如果我用正则表达式做这个,我可能会这样做:
def word_start_index2(text, seek_word):
    """Return the index of the first whole-word match of seek_word in text.

    Two alternatives are searched with one regular expression:

    * ``seek_word`` in any mix of upper and lower case, with no ASCII letter
      directly before or after it;
    * ``seek_word`` capitalised (first character upper, rest lower) as a
      camelCase component: not directly preceded by an uppercase ASCII
      letter and not directly followed by a lowercase ASCII letter.

    Args:
        text: The string to search.
        seek_word: The word to look for. Regex metacharacters in it are
            matched literally.

    Returns:
        The start index of the leftmost match, or None if there is no match
        or ``seek_word`` is empty.
    """
    import re

    # An empty word has no first character to capitalise and nothing to find.
    if not seek_word:
        return None

    def either_case(char):
        # Pattern matching char in lower or upper case. Everything is
        # escaped, so characters such as '^', ']' or '(' cannot change the
        # meaning of the surrounding pattern.
        lower, upper = char.lower(), char.upper()
        if lower == upper:
            return re.escape(char)
        return '(?:%s|%s)' % (re.escape(lower), re.escape(upper))

    camel_case = re.escape(seek_word[0].upper() + seek_word[1:].lower())
    seek_word_i = ''.join(either_case(c) for c in seek_word)
    regex1 = r'(?:(?<=[^a-zA-Z])|^)' + seek_word_i + r'(?=$|[^a-zA-Z])'
    regex2 = r'(?:(?<=[a-z]|[^A-Z])|^)' + camel_case + r'(?=$|[A-Z]|[^a-z])'
    m = re.search('%s|%s' % (regex1, regex2), text)
    return m.start() if m else None
我没有针对您的版本对此进行性能测试,但您可以尝试查看它是好还是坏,让我们知道。
我的答案可能会在某些边缘案例中给出不同的输出,但在您的评论中,您说您并不关心这些案例。
另外,我尝试使用标志 (?i) 将正则表达式的一部分标记为不区分大小写,但由于某种原因,这并没有正常工作。我无法解释原因。
最终的自我挑剔:该函数需要验证其参数,但为清楚起见,省略了此代码。您应该至少为以下内容添加检查:
答案 2 :(得分:1)
使用索引加快搜索速度: - )
from collections import defaultdict
class IndexedText(object):
    """Text with a precomputed index from lowercased word to start offsets.

    Words are separated by spaces and underscores. A word is additionally
    split wherever a lowercase character is immediately followed by an
    uppercase one (camelCase).
    """

    def __init__(self, text):
        """Store text and build the word index for it."""
        self.text = text
        self._index()

    def word_start_index(self, word):
        """Return the start offsets of word in the text, ignoring case.

        Args:
            word: The word to look up.

        Returns:
            A new list of offsets in ascending order; empty if the word does
            not occur.
        """
        # .get() keeps a miss from adding an empty entry to the defaultdict.
        return list(self.index.get(word.lower(), ()))

    def _index(self):
        """Build self.index, mapping each lowercased word to its offsets."""
        self.index = defaultdict(list)
        text = self.text
        separators = (' ', '_')

        def add(begin, end):
            self.index[text[begin:end].lower()].append(begin)

        start = None  # offset where the current word began; None between words
        prev = None   # previous character, used for the camelCase test
        for pos, char in enumerate(text):
            if char in separators:
                # Close the word on its first separator, so repeated, leading
                # or trailing separators never become part of a word.
                if start is not None:
                    add(start, pos)
                    start = None
            elif start is None:
                start = pos
            elif prev.islower() and char.isupper():  # camelCase boundary
                add(start, pos)
                start = pos
            prev = char

        # The last word is not followed by a separator.
        if start is not None:
            add(start, len(text))
if __name__ == "__main__":
    # Column ruler: the last digit of each character index in test_text.
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]
    index = IndexedText( test_text )
    for word in test_words:
        match_start = index.word_start_index( word )
        # Printing one pre-formatted string gives "<result> <word>" on both
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("%s %s" % (match_start, word))