I wrote an algorithm that splits text into n-grams (collocations) and computes the probability and other statistics for each collocation. When the input file is larger than 50 MB it takes a very long time to run; perhaps there is something that would help improve it. (A sketch of where I suspect the time goes follows the code.)
import math
import re
from datetime import datetime
def tokenize(input_file, encoding):
    lst = []
    with open(input_file, 'r', encoding=encoding) as f:
        for sent in f:
            sent = sent.lower()
            # strip Latin letters, digits and punctuation; raw string, since 'A-z'
            # also matched [, \, ], ^, _ and `, and \B is invalid inside a class
            sent = re.sub(r"[a-z0-9'\"`|/+#,()?!:=;.«»—@-]", '', sent)
            for word in re.findall(r'\w+', sent):
                lst.append(word)
    return lst
def ngrams_split(lst, n):
    # frequency of every n-gram; range(len(lst) - n + 1) so the last n-gram is not dropped
    counts = dict()
    grams = [' '.join(lst[i:i+n]) for i in range(len(lst) - n + 1)]
    for gram in grams:
        counts[gram] = counts.get(gram, 0) + 1
    return counts
def list_add(counts):
    # turn the dict into a list of (count, gram) pairs
    return [(val, key) for key, val in counts.items()]
def gram_add(lst, n):
    # list of all n-grams, with the same off-by-one fix as in ngrams_split
    return [' '.join(lst[i:i+n]) for i in range(len(lst) - n + 1)]
def two_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding)    # tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)           # split into n-grams
    ngrams = list_add(counts)               # (count, gram) pairs
    for count, gram in ngrams:
        if count >= n_filter:
            w1, w2 = gram.split()
            ngram_freq = math.log(count / n_words, 10)   # base 10, as in the other functions
            num = count * n_words
            f1 = lst.count(w1)              # rescans the whole token list
            f2 = lst.count(w2)
            mi = math.pow(math.log(num / (f1 * f2), 10), 2)
            ngram_prob = math.log(count / f1, 10)
            output_file.append((ngram_freq, mi, ngram_prob, count, gram))
    return output_file
def three_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding)    # tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)           # split into n-grams
    ngrams = list_add(counts)               # (count, gram) pairs
    ng = gram_add(lst, 2)                   # all bigrams
    for count, gram in ngrams:
        if count >= n_filter:
            w1, w2, w3 = gram.split()
            ngram_freq = math.log(count / n_words, 10)
            num = count * n_words
            c2gram = ng.count(w1 + " " + w2)   # rescans the whole bigram list
            f1 = lst.count(w1)
            f2 = lst.count(w2)
            f3 = lst.count(w3)
            mi = math.pow(math.log(num / (f1 * f2 * f3), 10), 2)
            ngram_prob = math.log(count / c2gram, 10)
            output_file.append((ngram_freq, mi, ngram_prob, count, gram))
    return output_file
def four_grams_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding)    # tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)           # split into n-grams
    ngrams = list_add(counts)               # (count, gram) pairs
    ng2 = gram_add(lst, 2)                  # all bigrams
    for count, gram in ngrams:
        if count >= n_filter:
            w1, w2, w3, w4 = gram.split()
            ngram_freq = math.log(count / n_words, 10)
            num = count * n_words
            c1gram = ng2.count(w1 + " " + w2)
            c2gram = ng2.count(w2 + " " + w3)
            c3gram = ng2.count(w3 + " " + w4)
            f1 = lst.count(w1)
            f2 = lst.count(w2)
            f3 = lst.count(w3)
            f4 = lst.count(w4)
            mi = math.pow(math.log(num / (f1 * f2 * f3 * f4), 10), 2)
            prob1 = c1gram / f1
            prob2 = c2gram / f2
            prob3 = c3gram / f3
            ngram_prob = math.log(prob1, 10) + math.log(prob2, 10) + math.log(prob3, 10)
            output_file.append((ngram_freq, mi, ngram_prob, count, gram))
    return output_file
def n_grams_stat(input_file, encoding, n_filter, n):
    # dispatch on n; the per-n functions already return the full list
    if n == 2:
        return two_gram_count(input_file, encoding, n_filter, n)
    elif n == 3:
        return three_gram_count(input_file, encoding, n_filter, n)
    elif n == 4:
        return four_grams_count(input_file, encoding, n_filter, n)
    return []
start_time = datetime.now()
results = n_grams_stat("/home/yan/PycharmProjects/vk/piidsluhano/men_pidsluhano.txt", 'utf-8', n_filter=3, n=4)
# open the output once instead of re-opening it per row ('dwwaa' is not a valid mode)
with open("/home/yan/PycharmProjects/vk/piidsluhano/men_4grams", 'w') as f:
    for a, b, c, d, e in results:
        print(a, b, c, d, e)
        f.write(str(a) + ", " + str(b) + ", " + str(c) + ", " + str(d) + ", " + str(e) + '\n')
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
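
For context, here is a minimal sketch of where I suspect most of the time goes. This is an assumption, not something I have profiled: every lst.count(...) and ng.count(...) call above rescans the entire token (or bigram) list, so the statistics loops are roughly quadratic in the corpus size. Precomputing the frequencies once with collections.Counter from the standard library would make each lookup O(1). The names word_freq and bigram_freq and the path "input.txt" are illustrative, not part of my code:

from collections import Counter

lst = tokenize("input.txt", "utf-8")  # "input.txt" is a placeholder path

# one pass each over the corpus, instead of a full scan per n-gram
word_freq = Counter(lst)
bigram_freq = Counter(' '.join(lst[i:i+2]) for i in range(len(lst) - 1))

# inside the statistics loops, a dict lookup would then replace each scan:
# f1 = lst.count(w1)               ->  f1 = word_freq[w1]
# c2gram = ng.count(w1 + " " + w2) ->  c2gram = bigram_freq[w1 + " " + w2]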