我是 Python 新手,想从 PDF 文件中提取全部文本,用某种颜色高亮显示其中拼写错误的单词,然后将结果另存为 .pdf 或 .doc 文件。
目前我已经完成了以下步骤(提取文本并找出拼写错误的单词):
from nltk import word_tokenize
import enchant
import re
import PyPDF2
# Extract the text of every page of a PDF and print the words that the
# en_US dictionary does not recognise.

# Open the PDF. Only the path is passed: the second positional parameter of
# PdfFileReader is `strict`, not a file mode, so the former 'rb' argument was
# merely being interpreted as a truthy `strict` flag.
pdf_reader = PyPDF2.PdfFileReader('E:\\PyProject\\PDF\\test1.pdf')
# get number of pages
NumPages = pdf_reader.getNumPages()
# Read all the pages, collecting each page's text so that the spell check
# below covers the whole document rather than only the last page.
page_texts = []
for i in range(NumPages):
    PageObj = pdf_reader.getPage(i)
    page_texts.append(PageObj.extractText())
# Join with newlines so the last word of one page cannot fuse with the
# first word of the next page.
Text = '\n'.join(page_texts)
# check misspelled words
d = enchant.Dict("en_US")
# Compiled once: only purely alphabetic tokens are spell-checked, which
# skips punctuation and numbers produced by the tokenizer.
alpha_only = re.compile(r'^[a-zA-Z ]*$')
# A set removes duplicates; sorting makes the printed order reproducible.
# Words are kept as str (no .encode), so they do not print as b'...'.
non_dict_words = sorted({
    word
    for word in word_tokenize(Text)
    if alpha_only.match(word) and not d.check(word)
})
print(non_dict_words)