对OCR文本文件进行拼写检查和更正

时间:2018-03-26 09:32:33

标签: python-3.x nlp spell-checking levenshtein-distance

我正在对通过OCR生成的给定文本文件列表进行拼写检查和更正。

我一开始使用的是Peter Norvig的拼写检查器和一个包含1万个单词的词典,但效率不高。

我在拼写检查方面还是新手。

我的示例代码:

import os
from textblob import TextBlob
import re
from collections import Counter
# Directory whose entries are listed and opened as input text files.
path = "/home/avics/PycharmProjects/spell_checker/textfile/"
# Directory in which the corrected output files are written (one per input
# file, under the same file name).
save_path = "/home/avics/PycharmProjects/spell_checker/output_file/"
import enchant

def words(text):
    """Return every run of word characters found in `text`, lower-cased."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the reference word list. The file is read
# inside a `with` block so its handle is closed as soon as the text has been
# loaded, instead of being left open for the lifetime of the process.
with open('/home/avics/PycharmProjects/spell_checker/google-10000-english.txt') as word_list_file:
    WORDS = Counter(words(word_list_file.read()))

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word`: its count in WORDS over N.

    N defaults to the sum of all counts in WORDS, evaluated once when this
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N


def correction(word):
    """Return the candidate for `word` that scores highest under P."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    The first non-empty result wins, tried in this order: the word itself if
    known, known words one edit away, known words two edits away. If none of
    those yields anything, a one-element list holding `word` is returned.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return the set of items from `words` that are present in WORDS."""
    found = set()
    for candidate in words:
        if candidate in WORDS:
            found.add(candidate)
    return found

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    # Walk every split point, including the one past the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # Deletion: drop the first character of the tail.
            results.add(head + tail[1:])
            # Replacement: substitute that character with each letter.
            for ch in alphabet:
                results.add(head + ch + tail[1:])
        if len(tail) > 1:
            # Transposition: swap the first two characters of the tail.
            results.add(head + tail[1] + tail[0] + tail[2:])
        # Insertion: place each letter at the split point.
        for ch in alphabet:
            results.add(head + ch + tail)
    return results

def edits2(word):
    """Return a lazy iterable of all strings two edits away from `word`.

    Duplicates are not removed; each result is produced by applying edits1
    to every string that edits1(word) returns.
    """
    # Computed up front, as the outermost iterable of a generator
    # expression would be.
    first_hops = edits1(word)

    def _second_hops():
        for intermediate in first_hops:
            yield from edits1(intermediate)

    return _second_hops()



def write_output_to_file(corrected_word):
    """Append `corrected_word` followed by a newline to the current output file.

    The target is `save_path + file`, where `file` is the module-level name
    bound by the loop over the input directory, so this must be called while
    that loop is processing a file.
    """
    # The `with` block closes the handle after every write; previously each
    # call leaked an open file object.
    with open(save_path + file, 'a') as output_file:
        output_file.write(corrected_word + "\n")


def line_read(corrected_line):
    """Spell-correct one line of text and append the result to the output file.

    The line is split on whitespace, each token longer than three characters
    is passed through `correction`, and the resulting tokens are written as a
    single line via `write_output_to_file`, each one prefixed by two spaces.
    """
    # split() with no argument discards the trailing newline and collapses
    # runs of whitespace, so no token carries "\n" and no token is empty.
    words_split = corrected_line.split()
    print(words_split)

    pieces = []
    for input_words in words_split:
        if len(input_words) <= 3:
            # NOTE(review): this branch was empty in the original, which made
            # the module a syntax error. Short tokens are kept unchanged here
            # so they are not lost from the output -- confirm this is the
            # intended handling.
            pieces.append(input_words)
        else:
            pieces.append(correction(input_words))

    corrected_word = "".join("  " + piece for piece in pieces)
    write_output_to_file(corrected_word)


#read/write operations

listOfFiles = os.listdir (path)

# NOTE: the loop variable must stay named `file`, because
# write_output_to_file() reads it as a global to build the output path.
for file in listOfFiles:
    print ("input file name: ", file)
    # The `with` block guarantees the input handle is closed once the file
    # has been processed, even if correcting a line raises.
    with open(path + file) as input_file:
        # Line numbers are 1-based in the progress output.
        for indx, line in enumerate(input_file, start=1):
            print ("line", indx, "text: ", line)
            line_read(line)

0 个答案:

没有答案