Question

我有代码可以帮助我从一个大小为85k的列表中找到与另一个列表中的单词（8,1k行）匹配的单词。它使用ngram方法，仅选择适当的匹配项，然后找到最佳对。经过一段时间的正常工作后，出现内存错误。

一开始它只有一个文件，所以我将其拆分为多个文件，但似乎无济于事。我已经尝试实现gc.collector，但我尚不擅长代码编写，因此也无济于事。

import re
from scipy import spatial
from multiprocessing import Pool, freeze_support
from itertools import groupby
import time
import os
import gc

start = time.time()


def ngram(text, n=2, pad=False):
   text = text.strip()
   if pad:
      text = " %s " % text
   return set([text[i:i + n] for i in range(len(text) - n + 1)])


def create_ngram(text1, text2, n=2, pad=True):
   return ngram(text1, n=n, pad=pad), ngram(text2, n=n, pad=pad)


def cos_dist(text1_text2, q=2, pad=True):
   text1, text2 = text1_text2
   text1, text2 = create_ngram(text1, text2, n=q, pad=pad)
   full_text = list(text1.union(text2))
   v1 = [(lambda x: 1 if x in text1 else 0)(x) for x in full_text]
   v2 = [(lambda x: 1 if x in text2 else 0)(x) for x in full_text]
   return spatial.distance.cosine(v1, v2)


def prepared(good_list, bad_list):
   for bad in good_list:
      for good in bad_list:
         bad = bad.rstrip('\n')
         good = good.rstrip('\n')
         prep_bad = re.sub(r'[\\\t!@#$?/_|.]', '', bad.strip()).lower()
         prep_good = good.lower()
         yield bad, good, prep_bad, prep_good


if __name__ == '__main__':
   freeze_support()
   true_positions = open(file_with_list_two_with_matches, 'r', encoding='cp65001')
   true_pos_list = [line.rstrip('\r\n') for line in true_positions]
   true_positions.close()
   print('True list was generated')
   for (dirpath, dirnames, filenames) in os.walk(dir_path_with_splitted_files):
      for file in filenames:
         gc.collect()
         bad_positions = open(each_file_path,'r', encoding='cp65001')
         bad_pos_list = [line.rstrip('\r\n') for line in bad_positions]
         bad_positions.close()
         print('Bad list for {} was generated'.format(file))

         gen_list = []

         for i in prepared(bad_pos_list, true_pos_list):
            gen_list.append((i[2], i[3]))
         print('Pairs to compare for {} were generated'.format(file))
         with open(result_file_for_each, 'a', encoding='cp65001') as the_file:
            pool = Pool(16)
            result = pool.map(cos_dist, gen_list)
            print('Results for {} were got'.format(file))
            for (orig_bad, orig_good, prep_bad, prep_good), dist in zip(prepared(bad_pos_list, true_pos_list), result):
               if dist < 0.39:
                  the_file.write(orig_bad + '|' + prep_bad + '|' + orig_good + '|' + str(dist) + '\n')
         print('Combinations for {} were written'.format(file))

      for (dirpath, dirnames, filenames) in os.walk(dir_path_with_results):
         for fname in filenames:
            print(fname)
         with open(combinations_union_file_path, 'a', encoding='cp65001') as outfile:
            for fname in filenames:
               with open(str(dirpath + '/' + fname), 'r', encoding='cp65001') as infile:
                  for line in infile:
                     outfile.write(line)
         with open(combinations_union_file_path, 'r', encoding='cp65001') as the_file:
            print('Parsing distances file...')
            res_list_list = [line.rstrip('\n').split('|') for line in the_file]
            res_tuple_list = [(line[0], line[2], float(line[3])) for line in res_list_list]
            print('Writing out groups...')
            with open(final_result_file_path, 'a', encoding='cp65001') as result_file:
               for key, group in groupby(res_tuple_list, lambda x: x[0]):
                  sort = sorted(group, key=lambda x: x[2])[0]
                  result_file.write(sort[0] + '|' + sort[1] + '|' + str(sort[2]) + '\n')
            the_file.close()
         end = time.time()

   print('Total time {}'.format(end - start))

它给出三个或文件的结果，然后出现内存不足（70gb）并因内存错误而失败。请帮助我优化此代码。

如何优化代码以减少内存使用

0 个答案: