I am currently working on a Python script that helps me filter bad data out of large sentence datasets (for machine translation). So far it has done the job, but problems appear once I move to datasets with 100k+ sentences. It starts fast enough, but gradually slows down to the point where processing a document takes days to finish. I have commented what each function does, but here is a quick breakdown:
It reads a sentence from a file in one language (the source) and fetches the corresponding line in the other language (the target).
It checks that the two sentences are not identical to each other, are not duplicated within their files, fall within a certain length range, and contain a certain ratio of character types.
#!/usr/bin/env python3
import re
import linecache
#from numba import jit, cuda

# Let the user define the path
path = "/home/justin/opus.txt/"
# Let the user define the files
source = path + input("Source input:")
target = path + input("Target input:")
new_src = path + input("Source destination:")
new_tgt = path + input("Target destination:")
num = 0
str1 = ""

# Gets the line number of the current source sentence
def line_no(file, sent):
    with open(file, "r") as myFile:
        for num, line in enumerate(myFile, 1):
            if sent in line:
                return num

# Writes accepted sentences to the output files (args are the new src/tgt files and the src/tgt sentences)
def accept_data(dest_src, dest_tgt, src, tgt):
    with open(dest_src, "a+") as output:
        output.write(src)
    with open(dest_tgt, "a+") as tgtout:
        tgtout.write(tgt)

# Checks that the source and target sentences are not identical to each other
def copy_check(src, tgt):
    if src == tgt:
        #print("The sentences are the same")
        return False
    else:
        return True

# Checks whether a sentence is already in a file
def duplicate_check(src, tgt):
    if len(re.findall(source, src)) > 1 or len(re.findall(target, tgt)) > 1:
        print("This is a duplicate")
        return False
    else:
        return True

# Checks that the character-length ratio of the pair is not too lopsided
def sentence_ratio(src, tgt, threshold=0.6):
    short, long = sorted([len(x) for x in [src, tgt]])
    #print(short/long)
    if short/long < threshold:
        return False
    else:
        return True

# Checks that both sentences fall within the word-count limits
def length_check(src, tgt, low_threshold=3, high_threshold=25):
    len_s = len(src.split())
    len_t = len(tgt.split())
    if len_s < low_threshold or len_s > high_threshold or len_t < low_threshold or len_t > high_threshold:
        return False
    else:
        return True

# Checks the ratio of character classes in each sentence
def char_ratio_check(src, tgt):
    alpha = r"([A-Z]|[a-z])"
    digits = r"\d"
    whitespace = r"\s"
    total = 0
    iter = 0
    # Count the number of letters, digits and other symbols in each sentence
    for lang in src, tgt:
        num_count = 0
        other_count = 0
        alpha_count = 0
        char_count = 0
        for char in lang:
            if re.match(alpha, char):
                alpha_count += 1
                char_count += 1
            elif re.match(digits, char):
                num_count += 1
                char_count += 1
            elif re.match(whitespace, char):
                pass
            else:
                other_count += 1
                char_count += 1
        # Reject if fewer than 50% of the characters are letters or digits, or if the sentence contains no letters at all
        if (alpha_count + num_count) / char_count < 0.5 or num_count + other_count == num_count + alpha_count + other_count:
            #print("This sentence has a bad character ratio")
            total += 1
        # print("This sentence is fine")
        iter += 1
    # After the second iteration (both sentences checked), only accept the pair if total is still zero
    if iter == 2:
        if not total:
            accept_data(new_src, new_tgt, src, tgt)
            #print("sentence {} is ok".format(line_no(source, src)))

if __name__ == '__main__':
    with open(source, "r+") as f:
        for src_sent in f:
            tgt_sent = linecache.getline(target, line_no(source, src_sent))
            tgt_sent = str1.join(tgt_sent)
            print(src_sent + "\n" + tgt_sent)
            print(line_no(source, src_sent))
            if copy_check(src_sent, tgt_sent):
                pass
            else:
                print("Let's Continue - 1")
                continue
            if duplicate_check(src_sent, tgt_sent):
                pass
            else:
                print("Let's Continue - 2")
                continue
            if sentence_ratio(src_sent, tgt_sent):
                pass
            else:
                print("Let's Continue - 3")
                continue
            if length_check(src_sent, tgt_sent):
                pass
            else:
                print("Let's Continue - 4")
                continue
            char_ratio_check(src_sent, tgt_sent)
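For reference, the individual checks can be exercised in isolation without the input files. Here is a minimal standalone sketch of three of the predicates on made-up toy sentence pairs (the pairs and expected outcomes are only for illustration; the logic mirrors the functions above, with the same default thresholds):

```python
# Standalone copies of three filter predicates, exercised on toy data.

def copy_check(src, tgt):
    # Reject pairs where source and target are identical.
    return src != tgt

def sentence_ratio(src, tgt, threshold=0.6):
    # Reject pairs whose character-length ratio is too lopsided.
    short, long_ = sorted(len(x) for x in (src, tgt))
    return short / long_ >= threshold

def length_check(src, tgt, low_threshold=3, high_threshold=25):
    # Reject pairs where either side falls outside the word-count limits.
    return all(low_threshold <= len(s.split()) <= high_threshold
               for s in (src, tgt))

pairs = [
    ("The cat sat on the mat", "Le chat est assis sur le tapis"),  # passes all three
    ("Hello world again", "Hello world again"),                    # identical pair
    ("Too short", "Aussi court"),                                  # under 3 words
]
for src, tgt in pairs:
    ok = copy_check(src, tgt) and sentence_ratio(src, tgt) and length_check(src, tgt)
    print(ok)  # True, False, False
```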
My apologies if parts of the code are rough; I am still far from the best at this, and I assumed I would be the only one ever looking at it. If I had to guess, the problem lies in reading from and appending to the files, but I am not sure.
I would appreciate any help with this, because I do not think this is a sustainable way to handle future datasets.
Kind regards,
Justin