我正在尝试用 Python 将 PDF 转换为文本,但运行时出现了如下错误:
PDFTextExtractionNotAllowed: Text extraction is not allowed: <_io.BufferedReader name='C:\\Users\\Downloads\\Facts_for_2017.pdf'>
我使用的代码是:
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
def pdfparser(data):
    """Feed every page of the PDF at path ``data`` through a TextConverter.

    Returns the contents of the in-memory buffer read after the last page.
    As posted, running this on the asker's file raises
    ``PDFTextExtractionNotAllowed`` (the error quoted in the question).
    """
    # NOTE(review): the file handle opened here is never closed.
    fp = open(data, 'rb')
    rsrcmgr = PDFResourceManager()
    # In-memory text buffer handed to the converter as its output stream.
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        # NOTE(review): indented inside the loop in the original post, so the
        # buffer is re-read (and the ``data`` argument overwritten) per page;
        # if there are no pages, the input path itself is returned.
        data = retstr.getvalue()
    return data
if __name__ == '__main__':
    # NOTE(review): Input_path is not defined anywhere in the posted snippet;
    # it has to be set to the PDF's path before this line can run.
    text = pdfparser(Input_path)
有人可以帮我看看吗?
PDF 文件的下载链接如下:
https://drive.google.com/file/d/1RyR-J-EwMywL6BqsYbl4Ocm96VzCYrM7/view?usp=sharing
答案 0 :(得分:0)
您收到错误消息,是因为 data = retstr.getvalue() 这一行的缩进有误,
它应该放在 for 循环之外。
不过,在解决这个问题之后,我又遇到了其他一些问题,因此在下面给出了完整的代码:
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
def pdfparser(data):
    """Extract the text of a PDF file with pdfminer.

    Args:
        data: Path of the PDF file to read.

    Returns:
        The extracted text as UTF-8 encoded ``bytes`` (the converter writes
        into a ``BytesIO`` buffer).

    Raises:
        OSError: If the file cannot be opened.
        PDFTextExtractionNotAllowed: If the document forbids text extraction;
            decrypt it first (for example with qpdf).
    """
    rsrcmgr = PDFResourceManager()
    # BytesIO rather than StringIO: in the answerer's environment StringIO
    # failed with `TypeError: unicode argument expected, got 'str'`.
    retstr = io.BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` guarantees the file is closed even if a page fails to parse.
        with open(data, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after every page has been processed.
        return retstr.getvalue()
    finally:
        device.close()
if __name__ == '__main__':
    # The PDF from the question is encrypted with a blank password, so it is
    # decrypted with qpdf before being handed to pdfminer.
    import os
    from subprocess import call

    if len(sys.argv) < 2:
        sys.exit('usage: %s <file.pdf>' % sys.argv[0])
    path = sys.argv[1]

    # The decrypted copy is written to the current directory as <name>_decr<ext>.
    file_name, extension = os.path.splitext(os.path.basename(path))
    pdf_filename_decr = file_name + "_decr" + extension

    # Pass an argument list and no shell, so paths containing spaces or shell
    # metacharacters reach qpdf intact and cannot inject commands.
    status = call(['qpdf', '--password=', '--decrypt', path, pdf_filename_decr])
    # qpdf exits with 3 when it only issued warnings; the output is still written.
    if status not in (0, 3):
        sys.exit('qpdf failed with exit status %d' % status)

    text = pdfparser(pdf_filename_decr)
答案 1 :(得分:0)
问题在于 PDFPage.get_pages() 默认会检查该 PDF 是否允许提取文本。
您必须传入 check_extractable=False 这个参数,
代码才能正常工作。另外,如果您要转换为 txt 的 PDF 受密码保护,也可以在同一处通过 password 参数传入密码。
遗憾的是,PDFPage 的文档对此说明得不太清楚。
# Password used to open the PDF; set it if the file is password protected.
password = ""
# check_extractable=False skips the extractability check that otherwise
# raises PDFTextExtractionNotAllowed for this file.
for page in PDFPage.get_pages(fp, check_extractable=False, password=password):
    interpreter.process_page(page)
# Read the buffer once, after the loop has processed every page.
data = retstr.getvalue()
您的整个代码如下所示:
import io
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
def pdfparser(data, password=""):
    """Extract the text of a PDF file with pdfminer.

    The extractability check is disabled, so documents flagged as
    "text extraction not allowed" are still converted.

    Args:
        data: Path of the PDF file to read.
        password: Password used to open the document; the default empty
            string matches the original hard-coded value.

    Returns:
        The extracted text as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    # The buffer and the device are released even if a page fails to parse.
    with io.StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(data, 'rb') as fp:
                pages = PDFPage.get_pages(fp,
                                          set(),        # empty set: no page filter
                                          maxpages=0,   # 0: no page limit
                                          password=password,
                                          caching=True,
                                          check_extractable=False)
                # get_pages is consumed lazily, so iterate while fp is open.
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer once, after the loop and before it is closed.
            return retstr.getvalue()
        finally:
            device.close()
注意:使用 Python 的 with open(...) as fp: 写法,
可以确保文件对象在使用完毕后(即使发生异常)被正确关闭。