这是我能想到的两个版本。当两个单词都是常见的时(例如“是”和“the”,版本1的n1 * n2缩放将是一个问题),并且对恶意输入(例如只有两个单词的文件)更健壮,V2是优选的。但是对于更有趣的查询(比如“大”和“动物”),v1同样快,我可以想到更真实的语义问题,而v2根本不会起作用,而v1会起作用。有没有办法加快速度?
导入时间 T1 = timeit.default_timer()
def distance(version,filename,wordOne,wordTwo):
f = open(filename, 'rU')
text = f.read()
f.close()
index = 0
distance = index
version = int(version)
print 'inputs', filename, wordOne, wordTwo
countOne = 0
countTwo = 0
print 'version', version
if version == 1:
word_pos = {}
for word in text.split():
if word in [wordOne, wordTwo]:
if word in word_pos.keys():
word_pos[word].append(index)
else:
word_pos[word] = [index]
index += 1
countOne = len(word_pos[wordOne])
countTwo = len(word_pos[wordTwo])
distances = []
low = 0
high = index
for posOne in word_pos[wordOne]:
for posTwo in word_pos[wordTwo]:
#shrink innner loop by distance?:
#for posTwo in range(int(posOne-distance), (posOne+distance)):
#if abs(posOne-posTwo) < distance:
#distance = abs(posOne-posTwo)
distances.append(abs(posOne-posTwo))
distance = min(distances)
elif version == 2:
switch = 0
indexOne = 0
indexTwo = 0
distance = len(text)
for word in text.split():
if word == wordOne:
indexOne = index
countOne += 1
if word == wordTwo:
indexTwo = index
countTwo += 1
if indexOne != 0 and indexTwo != 0:
if distance > abs(indexOne-indexTwo):
distance = abs(indexOne - indexTwo)
index += 1
t2 = timeit.default_timer()
print 'Delta t:', t2 - t1
print 'number of words in text:', index
print 'number of occurrences of',wordOne+':', countOne
print 'number of occurrences of',wordTwo+':', countTwo
if countOne < 1 or countTwo < 1:
print 'not all words are present'
return 1
print 'Shortest distance between \''+wordOne+'\' and \''+wordTwo+'\' is', distance, 'words'
return distance
答案 0 :(得分:1)
v2中昂贵的部分是if indexOne != 0 ...
块。一旦找到wordOne
和wordTwo
,文本中的剩余单词就会被调用多次。使用switch变量(我发现你有意使用它:)它可以移动它,如果阻塞到if word == wordOne
和if word == wordTwo
。在这种情况下,块的调用小于n1 + n2次。
这是代码。请注意,我们不再需要检查索引。
elif version == 3:
last_word_is_one = None
indexOne = 0
indexTwo = 0
countOne = 0
countTwo = 0
distance = len(text)
for word in text.split():
if word == wordOne:
indexOne = index
countOne += 1
if last_word_is_one == False:
if distance > abs(indexOne-indexTwo):
distance = abs(indexOne - indexTwo)
last_word_is_one = True
if word == wordTwo:
indexTwo = index
countTwo += 1
if last_word_is_one == True:
if distance > abs(indexOne-indexTwo):
distance = abs(indexOne - indexTwo)
last_word_is_one = False
index += 1