使用pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf_to_txt(path):
    """Extract the page text of the PDF at *path* using pdfminer.

    Every page of the document is run through a ``TextConverter`` that
    writes into an in-memory ``BytesIO`` buffer, and the buffer's contents
    are returned.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The contents of the output buffer (the converter is configured with
        the ``utf-8`` codec).

    NOTE(review): only text produced by the page interpreter is captured.
    Values typed into interactive form fields were reported as missing from
    this output, so they presumably have to be read from the PDF's form
    data instead -- confirm against the target document.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Context managers guarantee the input file and the output buffer are
    # released even if parsing a page raises.
    with open(path, 'rb') as fp, BytesIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set together with maxpages=0 is what the
            # original code passed; no explicit page restriction is applied.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before it is closed by the ``with`` block.
            return retstr.getvalue()
        finally:
            device.close()
使用PyPDF2
import PyPDF2
# Read the first page's text and the interactive form values with PyPDF2.
# The file is opened in a ``with`` block so the handle is always closed; all
# reads through the reader happen while the file is still open.
with open('85-0430EBallastWater.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    numPages = pdfReader.numPages  # total page count, kept instead of discarded

    pageObj = pdfReader.getPage(0)  # index 0 selects the first page
    out = pageObj  # kept so the original name ``out`` stays defined
    output = pageObj.extractText()

    # extractText() only returns the text of the page itself. Values typed
    # into fillable fields are read here from the document's form data.
    # getFields() returns None when the PDF has no interactive form, hence
    # the ``or {}`` fallback.
    fields = pdfReader.getFields() or {}
    # Map each field name to its current value ('/V'); None when unset.
    formValues = {name: field.get('/V') for name, field in fields.items()}
但即使尝试了这两个API,我得到的也只是PDF中已编码的文本,而不是填写框中的值(即PDF的可编辑部分)。
示例:船名(Vessel Name):salil
我能够读取到“Vessel Name:”这个标签,但读不到 salil,得到的是空字符串 ''。关键是要能读取到 salil。
下面是我要解析的pdf,请帮忙。