我有代码可以帮助我从一个大小为85k的列表中找到与另一个列表中的单词(8,1k行)匹配的单词。它使用ngram方法,仅选择适当的匹配项,然后找到最佳对。经过一段时间的正常工作后,出现内存错误。
一开始它只有一个文件,所以我将其拆分为多个文件,但似乎无济于事。我已经尝试实现gc.collector
,但我尚不擅长代码编写,因此也无济于事。
import re
from scipy import spatial
from multiprocessing import Pool, freeze_support
from itertools import groupby
import time
import os
import gc
start = time.time()
def ngram(text, n=2, pad=False):
text = text.strip()
if pad:
text = " %s " % text
return set([text[i:i + n] for i in range(len(text) - n + 1)])
def create_ngram(text1, text2, n=2, pad=True):
return ngram(text1, n=n, pad=pad), ngram(text2, n=n, pad=pad)
def cos_dist(text1_text2, q=2, pad=True):
text1, text2 = text1_text2
text1, text2 = create_ngram(text1, text2, n=q, pad=pad)
full_text = list(text1.union(text2))
v1 = [(lambda x: 1 if x in text1 else 0)(x) for x in full_text]
v2 = [(lambda x: 1 if x in text2 else 0)(x) for x in full_text]
return spatial.distance.cosine(v1, v2)
def prepared(good_list, bad_list):
for bad in good_list:
for good in bad_list:
bad = bad.rstrip('\n')
good = good.rstrip('\n')
prep_bad = re.sub(r'[\\\t!@#$?/_|.]', '', bad.strip()).lower()
prep_good = good.lower()
yield bad, good, prep_bad, prep_good
if __name__ == '__main__':
freeze_support()
true_positions = open(file_with_list_two_with_matches, 'r', encoding='cp65001')
true_pos_list = [line.rstrip('\r\n') for line in true_positions]
true_positions.close()
print('True list was generated')
for (dirpath, dirnames, filenames) in os.walk(dir_path_with_splitted_files):
for file in filenames:
gc.collect()
bad_positions = open(each_file_path,'r', encoding='cp65001')
bad_pos_list = [line.rstrip('\r\n') for line in bad_positions]
bad_positions.close()
print('Bad list for {} was generated'.format(file))
gen_list = []
for i in prepared(bad_pos_list, true_pos_list):
gen_list.append((i[2], i[3]))
print('Pairs to compare for {} were generated'.format(file))
with open(result_file_for_each, 'a', encoding='cp65001') as the_file:
pool = Pool(16)
result = pool.map(cos_dist, gen_list)
print('Results for {} were got'.format(file))
for (orig_bad, orig_good, prep_bad, prep_good), dist in zip(prepared(bad_pos_list, true_pos_list), result):
if dist < 0.39:
the_file.write(orig_bad + '|' + prep_bad + '|' + orig_good + '|' + str(dist) + '\n')
print('Combinations for {} were written'.format(file))
for (dirpath, dirnames, filenames) in os.walk(dir_path_with_results):
for fname in filenames:
print(fname)
with open(combinations_union_file_path, 'a', encoding='cp65001') as outfile:
for fname in filenames:
with open(str(dirpath + '/' + fname), 'r', encoding='cp65001') as infile:
for line in infile:
outfile.write(line)
with open(combinations_union_file_path, 'r', encoding='cp65001') as the_file:
print('Parsing distances file...')
res_list_list = [line.rstrip('\n').split('|') for line in the_file]
res_tuple_list = [(line[0], line[2], float(line[3])) for line in res_list_list]
print('Writing out groups...')
with open(final_result_file_path, 'a', encoding='cp65001') as result_file:
for key, group in groupby(res_tuple_list, lambda x: x[0]):
sort = sorted(group, key=lambda x: x[2])[0]
result_file.write(sort[0] + '|' + sort[1] + '|' + str(sort[2]) + '\n')
the_file.close()
end = time.time()
print('Total time {}'.format(end - start))
它给出三个或文件的结果,然后出现内存不足(70gb)并因内存错误而失败。请帮助我优化此代码。