我写了一个程序,用来遍历一个包含大约 50,000 个子字典的字典。在每个子字典中,有一个键对应的值是一个单词列表。现在我想遍历这些单词,找出与给定查询最相似的单词。但是程序需要很长时间才能运行完。我怎样才能加快这个过程?
import pickle,sys
def levenshtein_distance(first, second):
    """Return the Levenshtein (edit) distance between *first* and *second*.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        first: A string (or other indexable sequence with a length).
        second: A sequence of the same kind as *first*.

    Returns:
        The edit distance as a non-negative int; 0 when the inputs are equal.
    """
    if first == second:
        return 0
    if not first:
        return len(second)
    if not second:
        return len(first)

    # The distance is symmetric, so put the shorter input on the inner loop
    # to keep both rows as small as possible.
    if len(second) > len(first):
        first, second = second, first

    # previous[j] is the distance between the processed prefix of `first`
    # and the first j elements of `second`.
    previous = list(range(len(second) + 1))
    for row, item_a in enumerate(first, 1):
        current = [row]
        for col, item_b in enumerate(second):
            cost = 0 if item_a == item_b else 1
            current.append(min(current[col] + 1,        # insertion
                               previous[col + 1] + 1,   # deletion
                               previous[col] + cost))   # substitution
        # Rebind instead of copying the row element by element.
        previous = current
    return previous[-1]
def remove_duplicates(seq):
    """Return the items of *seq* as a list with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable because a set tracks what was seen.

    Args:
        seq: Any iterable of hashable items.

    Returns:
        A new list; *seq* itself is not modified.
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def _closest_words(query, term_lists):
    """Find the terms with the smallest Levenshtein distance to *query*.

    Args:
        query: The word to compare against.
        term_lists: Iterable of word lists (one list per stored entry).

    Returns:
        A ``(distance, words)`` tuple. ``words`` holds every distinct term
        at the minimal distance, in the order first encountered. When no
        term could be considered the result is ``(None, [])``.
    """
    best_distance = None
    best_words = []
    seen = set()
    query_length = len(query)
    for terms in term_lists:
        # An entry that already contains the exact query is skipped as a
        # whole; only entries without the query contribute candidates.
        if query in terms:
            continue
        for term in terms:
            # Each distinct word is scored once, however often it occurs.
            if term in seen:
                continue
            seen.add(term)
            # The length difference is a lower bound on the edit distance,
            # so such a term can never match or beat the current best.
            if (best_distance is not None
                    and abs(len(term) - query_length) > best_distance):
                continue
            distance = levenshtein_distance(query, term)
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_words = [term]
            elif distance == best_distance:
                best_words.append(term)
    return best_distance, best_words


def main():
    """Read queries from stdin and print the most similar stored words.

    Loads ``woordjes2.pkl`` (a dict whose values are dicts with a
    ``'termen'`` word list and a ``'tekst'`` string) and ``postlist.pkl``
    (indexed by word, yielding keys into the first dict). For every input
    line it prints the words closest to the query by Levenshtein distance,
    that distance, and the texts listed for the first such word.

    The search state is rebuilt for every query, so results of one query
    never leak into the next.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # use these files if they come from a trusted source.
    with open('woordjes2.pkl', 'rb') as handle:
        dict_pkld = pickle.load(handle)
    with open('postlist.pkl', 'rb') as handle:
        postlist = pickle.load(handle)

    # The word lists do not depend on the query, so collect them once.
    term_lists = [sub['termen'] for sub in dict_pkld.values()]

    for line in sys.stdin:
        query1 = line.rstrip()
        print('Uw query is: "' + query1 + '"')
        print('Een woord wat hier op lijkt is: \n')

        lowest_distance, words_new = _closest_words(query1, term_lists)
        if not words_new:
            # Nothing to report (no data, or every entry contains the query).
            print('Geen vergelijkbare woorden gevonden.\n')
            continue

        for woordje in words_new:
            print('-' + woordje)
        print('\nDeze woorden hebben een Levenshtein afstand van ' + str(lowest_distance))
        print('\nWe zoeken nu naar tweets die het eerste woord in deze lijst bevatten: \n')
        for r in postlist[words_new[0]]:
            print(dict_pkld[r]['tekst'] + '\n')
# Script entry point. There is no ``__name__`` guard, so this call also runs
# whenever the module is imported.
main()
提前致谢!