我想通过PyPDF2将pdf文件转换为文本,但是转换后的文本看起来与PDF文件有所不同。具体来说,PDF中的一行被分成文本中的多行,单词也可能被断开。随附的是PDF和文本文件,以及下面的代码。谁能帮我解决这个问题?
enter code here
import PyPDF2
def extractPdfText(filePath=''):
    """Return the text of every page in the PDF at ``filePath`` as one string.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        The ``extractText()`` output of each page, concatenated in page
        order with no separator added between pages.

    Side effects:
        Prints the total number of pages to stdout.
    """
    # Keep the file open for the whole extraction; the context manager
    # closes it even if reading or extracting raises.
    with open(filePath, 'rb') as fileObject:
        pdfFileReader = PyPDF2.PdfFileReader(fileObject)
        totalPageNumber = pdfFileReader.numPages
        print('This pdf file contains totally ' + str(totalPageNumber) + ' pages.')
        # Collect per-page text first and join once, rather than growing a
        # string with repeated concatenation.
        pageTexts = [
            pdfFileReader.getPage(pageNumber).extractText()
            for pageNumber in range(totalPageNumber)
        ]
    return ''.join(pageTexts)
# Placeholder value: replace with the path of the PDF to extract.
pdfFilePath = 'PDF file path'
# Runs the extraction as soon as the script executes and keeps the combined text.
pdfText = extractPdfText(pdfFilePath)
答案 0 :(得分:0)
此答案使用 encode('utf-8')将每页的输出保持在一起。我不知道您需要什么输出,因为您的问题中没有指定输出。
from PyPDF2 import PdfFileReader
def pdf_text_extractor(path):
    """Print the extracted text of each page of the PDF at ``path``.

    Every page's text is UTF-8 encoded and printed with its own ``print``
    call. Nothing is returned.
    """
    with open(path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        for page_index in range(reader.numPages):
            page_text = reader.getPage(page_index).extractText()
            # Printing the encoded value (<class 'bytes'>) shows its repr,
            # so each page comes out on a single line.
            print(page_text.encode('utf-8'))
            # Alternative, left disabled: decode the bytes again and split
            # on '\n' to get a list of lines. That does not keep
            # paragraphs together.
            #   encoded = page_text.encode('utf-8')
            #   lines = str(encoded, 'utf-8').split('\n')
            #   print(lines)
# Example input file; replace with the PDF to inspect.
path = 'mypdf.pdf'
# Prints each page's encoded text; the call returns nothing.
pdf_text_extractor(path)
PyPDF2 文档对 extractText() 函数的说明如下:
extractText()
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF files, but
poorly for others, depending on the generator used. This will be refined in
the future. Do not rely on the order of text coming out of this function, as
it will change if this function is made more sophisticated.
Returns: a unicode string object.
这意味着提取与PDF中格式化文本完全相同的文本可能会出现问题。
您也可以使用 tika 来完成此任务,但它的输出同样不会完全干净。
from tika import parser
# Parse the PDF with tika (xmlContent=True) and keep only the 'content'
# entry of the returned mapping, then show it.
parse_entire_pdf = parser.from_file('mypdf.pdf', xmlContent=True)['content']
print(parse_entire_pdf)
真正的问题是——您打算如何使用提取的文本?
答案 1 :(得分:0)
这就是我要做的。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text content of a PDF file with pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is passed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    # Use the StringIO name the file actually imports; the module name
    # ``io`` itself is never imported.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The input file is closed even if a page fails to process.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed and before the
        # buffer itself is closed on leaving the ``with`` block.
        return output.getvalue()
#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF directly inside ``pdfDir`` to a text file in ``txtDir``.

    Each entry of ``pdfDir`` whose extension is ``.pdf`` (case-insensitive)
    is passed to ``convert``; the returned text is written to
    ``<original file name>.txt`` inside ``txtDir``.

    Args:
        pdfDir: Directory to scan. An empty string means the current
            working directory.
        txtDir: Directory that receives the ``.txt`` files. It is not
            created here.

    Returns:
        None.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # no pdfDir passed in
    for pdf in os.listdir(pdfDir):
        # splitext only reports a real extension, so a file literally named
        # "pdf" (no dot) is not mistaken for a PDF.
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue
        # os.path.join works whether or not the directories end with a
        # separator.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding; the context manager flushes and closes the file.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)
# Set both paths to real directories before running; these are placeholders.
# NOTE(review): txtDir is written with a double slash after the drive
# letter ("C://") -- looks like a typo; confirm before use.
pdfDir = "C:/your_path_here/"
txtDir = "C://your_path_here/"
# Runs the batch conversion as soon as the script executes.
convertMultiple(pdfDir, txtDir)