我正在尝试使用pdfminer从pdf获取文本数据。我可以使用pdfminer命令行工具pdf2txt.py成功将此数据提取到.txt文件。我目前这样做,然后使用python脚本来清理.txt文件。我想将pdf提取过程合并到脚本中并为自己节省一步。
当我找到这个链接时,我以为找到了解决办法,但我没能成功解决任何问题。也许那里列出的函数需要再次更新,因为我使用的是更新版本的pdfminer。
我还尝试了这里展示的函数,但同样不起作用。
我尝试的另一种方法是使用os.system在脚本中调用该脚本,但同样没有成功。
我使用的是Python 2.7.1版和pdfminer版20110227。
答案 0 :(得分:75)
这是一个适用于最新版本的新解决方案:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer while
        the pages were processed (encoded with ``codec``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        try:
            # ``open`` + ``with`` replaces the Python-2-only ``file()`` call
            # and guarantees the PDF is closed even if a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer, as the original did.
            device.close()
        # Local is named ``text`` so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
答案 1 :(得分:66)
这是我最终制作的清理版本,对我有用。以下只是简单地返回PDF中的字符串,给定其文件名。我希望这可以节省一些时间。
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def convert_pdf(path):
    """Return the text of the PDF at ``path`` using the ``process_pdf`` helper.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer
        (encoded with ``codec``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        try:
            # ``with open`` replaces the Python-2-only ``file()`` builtin and
            # closes the PDF even when ``process_pdf`` raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Local is named ``text`` so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
此解决方案在2013年11月的API变更之前有效。
答案 2 :(得分:12)
我知道回答你自己的问题很不好,但我想我可能已经想到了这一点,我不想让其他人浪费时间寻找解决我问题的方法。
我在我的问题中发布的一个链接中遵循了该建议,并重新使用了pdfminer附带的当前pdf2txt.py脚本。这是函数,以防它对其他人有用。感谢用户skyl发布该答案,我所要做的就是进行一些更改,使其与当前版本的pdfminer一起使用。
此函数接受pdf并在同一目录中创建一个.txt文件,名称相同。
def convert_pdf(path, outtype='txt', opts=None):
    """Write the text of the PDF at ``path`` to a file next to it.

    The output file has the same name as ``path`` with its extension replaced
    by ``outtype`` (``report.pdf`` -> ``report.txt``), unless an ``-o`` option
    overrides it.

    Args:
        path: Filesystem path of the PDF file to read.
        outtype: Extension used to build the default output file name. The
            output is always produced by ``TextConverter`` regardless of this
            value.
        opts: Optional pdf2txt.py-style options, either an iterable of
            ``(flag, value)`` pairs or a dict mapping flag to value.
            Flags this function does not use are ignored.

    Returns:
        None. The text is written to the output file (or to ``sys.stdout``
        when ``-o`` is given an empty value).
    """
    import os
    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams

    # splitext instead of ``path[:-3]`` so extensions that are not exactly
    # three characters long are handled too.
    outfile = os.path.splitext(path)[0] + '.' + outtype

    # ``opts=None`` avoids a shared mutable default; a dict is accepted by
    # iterating its items (iterating the dict itself would yield bare keys).
    if opts is None:
        option_pairs = ()
    elif isinstance(opts, dict):
        option_pairs = opts.items()
    else:
        option_pairs = opts

    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()
    for (k, v) in option_pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-c': codec = v
        # '-Y', '-O', '-t' and '-s' were only ever stored in locals that
        # nothing read, so they are accepted and ignored like unknown flags.

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only close files we opened.
        if outfp is not sys.stdout:
            outfp.close()
    return
答案 3 :(得分:10)
以下代码在最新版本的pdfminer(截至2014年9月)上对我有效:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import unicodedata, codecs
from io import StringIO
def getPDFText(pdfFilenamePath):
    """Return the text of the PDF at ``pdfFilenamePath``, or ``''`` on failure.

    Args:
        pdfFilenamePath: Filesystem path of the PDF file to read.

    Returns:
        The extracted text, or an empty string when the document cannot be
        parsed or does not allow text extraction (a message is printed in
        both cases).
    """
    retstr = StringIO()
    # PDFs are binary: open in 'rb' (the original used text mode 'r') and let
    # the ``with`` block close the handle, which the original never did.
    with open(pdfFilenamePath, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        try:
            document = PDFDocument(parser)
        except Exception:
            # Deliberate best-effort: any parse failure is reported and
            # turned into an empty result instead of propagating.
            print(pdfFilenamePath,'is not a readable pdf')
            return ''
        if not document.is_extractable:
            print(pdfFilenamePath,"Warning: could not extract text from pdf file.")
            return ''
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
if __name__ == '__main__':
    # ``path`` was never defined in the original; take the PDF path from the
    # first command-line argument instead.
    import sys
    words = getPDFText(sys.argv[1])
答案 4 :(得分:6)
这是我的解决方案
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
def convert_pdf_to_txt(path, pages=None):
    """Extract text from the PDF at ``path``, optionally from selected pages.

    Args:
        path: Filesystem path of the PDF file to read.
        pages: Optional iterable of page numbers passed to
            ``PDFPage.get_pages``. When falsy, an empty set is passed.

    Returns:
        The text the ``TextConverter`` wrote to the in-memory buffer.
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        try:
            # The ``with`` block closes the PDF even if a page fails to parse.
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            # Close the converter before reading the buffer, as the original did.
            converter.close()
        text = output.getvalue()
    finally:
        output.close()
    return text
例如,您只想阅读pdf文件的前3页:
# Call the function by its actual name (the original called an undefined ``convert``).
text = convert_pdf_to_txt('../Data/EN-FINAL Table 9.pdf', pages=[0,1,2])
pdfminer.six == 20160614
python:3.x
答案 5 :(得分:4)
以下对非process_pdf答案的修改直接从URL字符串名称中提取文本,并使用版本20140328和Python 2.7:
from urllib2 import urlopen
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(url):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: URL of the PDF document; it is fetched with ``urlopen``.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer
        (encoded with ``codec``).
    """
    # Read the whole response up front and close it explicitly; the original
    # never closed the HTTP response.
    response = urlopen(url)
    try:
        scrape = response.read()
    finally:
        response.close()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = StringIO(scrape)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        try:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            fp.close()
            device.close()
        textstr = retstr.getvalue()
    finally:
        retstr.close()
    return textstr
答案 6 :(得分:3)
如果您正在处理通过urllib2抓取的数据,请尝试以下方法(其开发过程和说明见此处):
def pdf_to_text(scraped_pdf_data):
    """Return the text extracted from already-downloaded PDF data.

    Args:
        scraped_pdf_data: Raw contents of a PDF document, held in memory.

    Returns:
        The text the ``TextConverter`` wrote to its output buffer.
    """
    import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    # Wrap the raw data in a file-like object positioned at the start.
    source = StringIO.StringIO(scraped_pdf_data)
    sink = StringIO.StringIO()

    manager = PDFResourceManager()
    converter = TextConverter(manager, sink, laparams=LAParams())
    process_pdf(manager, converter, source)
    converter.close()

    extracted = sink.getvalue()
    sink.close()
    source.close()
    return extracted
与其他答案一样,此处的代码改编自PDFMiner本身提供的pdf2txt实用程序。因此,您也可以转换为html或xml——只需将上面的TextConverter替换为HTMLConverter或XMLConverter即可。
答案 7 :(得分:3)
以下代码适用于最新版本的PDFMiner,它采用pdf路径并以.txt格式返回文本。
P.S:这是对上述答案的修改。
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path, outtype='txt'):
    """Write the text of the PDF at ``path`` to a sibling file.

    The output file has the same name as ``path`` with its extension replaced
    by ``outtype`` (``report.pdf`` -> ``report.txt``).

    Args:
        path: Filesystem path of the PDF file to read.
        outtype: Extension of the output file.

    Returns:
        None. The extracted text is written to the output file.
    """
    import os

    # splitext instead of ``path[:-3]`` so extensions that are not exactly
    # three characters long are handled too.
    outfile = os.path.splitext(path)[0] + '.' + outtype
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # ``outfile`` is never empty, so the original ``sys.stdout`` fallback
    # (which relied on an unimported ``sys``) was unreachable and is dropped.
    with open(outfile, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
    return
答案 8 :(得分:3)
万一有人还需要这个, 得到它与请求和python 3.4。 感谢@bahmait的回答:)
import requests
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
def pdf_to_text(url=None):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: URL of the PDF document; it is fetched with ``requests.get``.

    Returns:
        The extracted text, or ``None`` when the HTTP response is not OK.
    """
    from io import BytesIO

    pdf = requests.get(url)
    if not pdf.ok:
        return None

    # A PDF is binary data: hand the raw bytes to the parser instead of
    # decoding them as UTF-8 text, which fails or corrupts most documents.
    fp = BytesIO(pdf.content)
    outfp = StringIO()
    try:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        text = outfp.getvalue()
    finally:
        outfp.close()
        fp.close()
    return text
if __name__ == "__main__":
    # Fetch both URLs first, then print the two results in the same order.
    sample_pdf_url = "https://bytebucket.org/hsoft/pdfminer3k/raw/28edfc91caed830674ca0b928f42571f7dee6091/samples/simple1.pdf"
    missing_url = 'http://www.google.com/404'
    hello_world_text, no_pdf = pdf_to_text(sample_pdf_url), pdf_to_text(missing_url)
    for extracted in (hello_world_text, no_pdf):
        print(extracted)
答案 9 :(得分:2)
这是我最终制作的清理版本,对我有用。以下只是简单地返回PDF中的字符串,给定其文件名。我希望这可以节省一些时间。
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def convert_pdf(path):
    """Return the text of the PDF at ``path`` using the ``process_pdf`` helper.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer
        (encoded with ``codec``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        try:
            # ``with open`` replaces the Python-2-only ``file()`` builtin and
            # closes the PDF even when ``process_pdf`` raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Local is named ``text`` so the builtin ``str`` is not shadowed.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
谁能告诉我:pdf文件是否需要放在某个特定的位置?
答案 10 :(得分:2)
这是一个适用于pdfminer.six、运行于python 3.6的答案。它使用pdfminer.high_level模块;如果您只是想从简单的PDF文件中获取原始文本,该模块可以屏蔽许多底层细节。
import pdfminer
import io
def extract_raw_text(pdf_filename):
    """Return the raw text of the PDF at ``pdf_filename``.

    Args:
        pdf_filename: Filesystem path of the PDF file to read.

    Returns:
        The text written to the in-memory buffer by ``extract_text_to_fp``.
    """
    # Import the submodules explicitly: a bare ``import pdfminer`` is not
    # guaranteed to make ``pdfminer.high_level`` / ``pdfminer.layout``
    # available as attributes of the package.
    from pdfminer.high_level import extract_text_to_fp
    from pdfminer.layout import LAParams

    output = io.StringIO()
    laparams = LAParams()  # Using the defaults seems to work fine
    with open(pdf_filename, "rb") as pdffile:
        extract_text_to_fp(pdffile, output, laparams=laparams)
    return output.getvalue()
答案 11 :(得分:1)
以防仍有人需要:如何使用PDFMiner将PDF输出为HTML:
import sys
import getopt
from Core.Interfaces.IReader import IReader
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from cStringIO import StringIO
class PdfReader(object):
    """Render PDF files as HTML with PDFMiner and print the result."""

    def __init__(self):
        pass

    def readText(self, path, outtype='text', opts=None):
        """Convert the PDF at ``path`` to HTML and print it to stdout.

        Args:
            path: Filesystem path of the PDF file to read.
            outtype: Accepted for interface compatibility; the output is
                always produced by ``HTMLConverter``.
            opts: Optional pdf2txt.py-style options, either an iterable of
                ``(flag, value)`` pairs or a dict mapping flag to value.
                Flags this method does not use are ignored.

        Returns:
            None. The layout parameters and the generated HTML are printed.
        """
        # ``opts=None`` avoids a shared mutable default; a dict is accepted
        # by iterating its items (iterating the dict would yield bare keys).
        if opts is None:
            option_pairs = ()
        elif isinstance(opts, dict):
            option_pairs = opts.items()
        else:
            option_pairs = opts

        debug = 0
        password = ''
        pagenos = set()
        maxpages = 0
        codec = 'utf-8'
        laparams = LAParams()
        for (k, v) in option_pairs:
            if k == '-d': debug += 1
            elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-m': maxpages = int(v)
            elif k == '-P': password = v
            elif k == '-n': laparams = None
            elif k == '-A': laparams.all_texts = True
            elif k == '-V': laparams.detect_vertical = True
            elif k == '-M': laparams.char_margin = float(v)
            elif k == '-L': laparams.line_margin = float(v)
            elif k == '-W': laparams.word_margin = float(v)
            elif k == '-F': laparams.boxes_flow = float(v)
            elif k == '-c': codec = v
            # '-o', '-Y', '-O', '-t' and '-s' were only ever stored in locals
            # that nothing read, so they are accepted and ignored.

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(laparams)

        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFPageInterpreter.debug = debug
        PDFDevice.debug = debug

        rsrcmgr = PDFResourceManager()
        outfp = StringIO()
        try:
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            try:
                # ``with open`` replaces the Python-2-only ``file()`` builtin
                # and closes the PDF even when conversion raises.
                with open(path, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                check_extractable=True)
            finally:
                device.close()
            print(outfp.getvalue())
        finally:
            outfp.close()
        return
# Demo: pair each option flag with its value, then render the sample PDF.
opt = [('-W', 0.5), ('-L', 0.4), ('-t', 'html')]
reader = PdfReader()
reader.readText("/test_data/test.pdf", "html", opt)
答案 12 :(得分:0)
以下代码段能够使用最新版本的pdfminer从PDF文档中提取纯文本(截至2016年3月23日)。希望这会有所帮助。
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path``, print it and return it.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` wrote to the in-memory buffer
        (encoded with ``codec``).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # The original also built a PDFParser/PDFDocument pair here, but neither
    # name is imported (NameError) and the result was never used:
    # ``PDFPage.get_pages`` works directly on the open file.
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(text)
    return text
# Replace the placeholder string with the path of the PDF file to convert.
convert_pdf_to_txt('<path_of_the_pdf_file>')
答案 13 :(得分:0)
这个方法在python 3中对我有效。它需要pdfminer.six包
-- Collect a whitelist of application names from the user, ask for
-- confirmation, then quit every visible application not on the whitelist.
say "are you sure you want to quit all applications?"

-- Keep prompting until the user presses "Done"; each dialog adds one name.
-- The list starts empty (the original seeded it with an empty string).
set white_list to {}
set doneWhitelist to ""
repeat until doneWhitelist = "Done"
	set whiteListedApps to display dialog "WhiteList" buttons {"Add More", "Done"} default answer "Finder"
	set whiteListedAppNames to text returned of whiteListedApps
	set doneWhitelist to button returned of whiteListedApps
	set end of white_list to whiteListedAppNames
end repeat

-- Build the comma-separated summary shown in the confirmation alert.
-- The original loop reset its index to 2 on every pass, so it repeated the
-- same entry instead of listing every whitelisted name.
set previousDelimiters to AppleScript's text item delimiters
set AppleScript's text item delimiters to ", "
set orginizedList to white_list as text
set AppleScript's text item delimiters to previousDelimiters

set confirmQuit to display alert "Are you sure you want to quit all applications except for " & orginizedList & "?" buttons {"Yes", "No"}
set confirmQuit to button returned of confirmQuit
if confirmQuit = "No" then
	-- Error -128 is "User canceled": stop the script without quitting anything.
	error number -128
else
	tell application "System Events" to set the visible of every process to true
	try
		tell application "Finder"
			set process_list to the name of every application whose visible is true
		end tell
		repeat with i from 1 to (number of items in process_list)
			set this_process to item i of the process_list
			if this_process is not in white_list then
				tell application this_process
					quit
				end tell
			end if
		end repeat
	on error
		tell the current application to display dialog "An error has occurred!" & return & "This script will now quit" buttons {"Quit"} default button 1 with icon 0
	end try
end if
代码如下(与每个人相同的代码,但有较小的修复):
pip install pdfminer.six
答案 14 :(得分:0)
完全公开,我是 pdfminer.six 的维护者之一。
如今,有多种 api 可以根据您的需要从 PDF 中提取文本。在幕后,所有这些 api 都使用相同的逻辑来解析和分析布局。
(所有示例都假设您的 PDF 文件名为 example.pdf)
命令行
如果你只想提取一次文本,可以使用命令行工具pdf2txt.py:
$ pdf2txt.py example.pdf
高级 API
如果你想用Python提取文本,可以使用高级api。如果您想以编程方式从许多 PDF 中提取文本,则此方法是首选解决方案。
from pdfminer.high_level import extract_text
# Extract the text of example.pdf through the high-level API.
text = extract_text('example.pdf')
可组合的 api
还有一个可组合的 api,它在处理结果对象方面提供了很大的灵活性。例如,您可以使用它来实现自己的布局算法。其他答案中建议使用此方法,但我仅在您需要自定义 pdfminer.six 的行为方式时才推荐此方法。
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Composable API: wire a text converter to an in-memory buffer, then feed it
# every page of example.pdf.
output_string = StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)

with open('example.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# Everything the converter wrote is now in the buffer.
print(output_string.getvalue())