在Python中快速遍历大量单词

时间:2015-10-20 22:03:04

标签: python dictionary iteration

我写了一个程序,用来检查一个包含大约50,000个子字典的字典。在每个子字典中,有一个键的值是一个单词列表。现在我想遍历这些单词,找出与给定查询最相似的单词。但是程序需要很长时间才能运行完。我怎样才能加快这个过程?

import pickle,sys

def levenshtein_distance(first, second):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions,
    deletions and substitutions needed to turn ``first`` into ``second``.
    Only two rows of the dynamic-programming table are kept at a time, so
    memory use is proportional to ``len(second)`` while the running time is
    proportional to ``len(first) * len(second)``.

    Args:
        first: The source sequence (typically a ``str``).
        second: The target sequence (typically a ``str``).

    Returns:
        The edit distance as a non-negative ``int``.
    """
    # Cheap exits: identical inputs cost nothing, and turning an empty
    # sequence into a non-empty one costs exactly one insertion per item.
    if first == second:
        return 0
    if not first:
        return len(second)
    if not second:
        return len(first)

    # previous[j] holds the distance between the already-processed prefix of
    # ``first`` and the first j items of ``second``.
    previous = list(range(len(second) + 1))
    for row, first_item in enumerate(first, start=1):
        # Column 0: deleting all ``row`` items of the prefix of ``first``.
        current = [row]
        for col, second_item in enumerate(second):
            cost = 0 if first_item == second_item else 1
            current.append(
                min(
                    current[col] + 1,        # insertion
                    previous[col + 1] + 1,   # deletion
                    previous[col] + cost,    # substitution (or match)
                )
            )
        # Rebinding the row is free; copying it item by item is not.
        previous = current

    return previous[-1]

def remove_duplicates(seq):
    """Return the items of ``seq`` as a list without repeats.

    Only the first occurrence of each item is kept, and the surviving items
    stay in the order in which they were first encountered. Items must be
    hashable because a set is used to remember what has been seen.
    """
    unique_items = []
    encountered = set()
    for item in seq:
        if item in encountered:
            continue
        encountered.add(item)
        unique_items.append(item)
    return unique_items

def _closest_words(query, term_lists):
    """Find the terms with the smallest Levenshtein distance to ``query``.

    Args:
        query: The word to look up.
        term_lists: An iterable of term collections (one per entry).

    Returns:
        A ``(distance, words)`` tuple. ``words`` holds every distinct term
        that shares the smallest distance, in the order first encountered.
        When no term qualifies the result is ``(None, [])``.
    """
    best_distance = None
    best_words = []
    scored = set()
    query_length = len(query)

    for terms in term_lists:
        # An entry that already contains the query verbatim contributes no
        # suggestions at all (same rule as the original search loop).
        if query in terms:
            continue
        for term in terms:
            # The same term shows up in many entries; score it only once.
            if term in scored:
                continue
            scored.add(term)

            # The edit distance is never smaller than the difference in
            # length, so such a term can neither beat nor tie the best.
            if (best_distance is not None
                    and abs(len(term) - query_length) > best_distance):
                continue

            distance = levenshtein_distance(query, term)
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_words = [term]
            elif distance == best_distance:
                best_words.append(term)

    return best_distance, best_words


def main():
    """Suggest the closest known words for each query line read from stdin.

    Loads the entry dictionary from ``woordjes2.pkl`` and a lookup table
    from ``postlist.pkl``. For every input line it prints the terms with the
    smallest Levenshtein distance to that line, the distance itself, and the
    ``'tekst'`` of each entry that ``postlist`` lists for the first of those
    terms. Each query is evaluated independently of the previous ones.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('woordjes2.pkl', 'rb') as handle:
        dict_pkld = pickle.load(handle)
    with open('postlist.pkl', 'rb') as handle:
        postlist = pickle.load(handle)

    # The term lists do not depend on the query, so collect them only once.
    term_lists = [sub['termen'] for sub in dict_pkld.values()]

    # Iterating over stdin handles each query as soon as its line arrives.
    for line in sys.stdin:
        query1 = line.rstrip()
        print('Uw query is: "' + query1 + '"')
        print('Een woord wat hier op lijkt is: \n')

        distance, words_new = _closest_words(query1, term_lists)
        if not words_new:
            # No candidate at all (no entries, or every entry contains the
            # query); report it instead of failing on an empty result list.
            print('Geen vergelijkbare woorden gevonden.\n')
            continue

        for woordje in words_new:
            print('-' + woordje)
        print('\nDeze woorden hebben een Levenshtein afstand van ' + str(distance))

        print('\nWe zoeken nu naar tweets die het eerste woord in deze lijst bevatten: \n')
        # presumably postlist maps a term to the keys of the entries that
        # contain it -- verify against the code that builds postlist.pkl
        for r in postlist[words_new[0]]:
            print(dict_pkld[r]['tekst'] + '\n')


# Script entry point: start the query loop. There is no __name__ guard, so
# this call also runs whenever the module is imported.
main()

提前致谢!

0 个答案:

没有答案