我有一个PDF文件,并从中得到了下面这个"日期 → 页码列表"的字典。我需要使用Python单独取出日期08/30/1941所对应的页面。请问应该用什么逻辑来实现?
{'05/20/2016': [1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6],
'06/20/2016': [4],
'08/30/1941': [1, 5]}
答案 0 :(得分:0)
因此,您需要做的是使用PyPDF2或PDFMiner从PDF中提取文本
**PDFMiner**
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of a PDF with PDFMiner and return it as a list of lines.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text written by the ``TextConverter`` for all processed pages,
        split on newline characters.

    The file handle, the converter device and the string buffer are released
    even when opening or processing the PDF raises.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # Same arguments as the original call: empty page-number set,
            # maxpages=0, empty password, caching on, extractability checked.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the ``finally`` clause.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text.split('\n')
使用上述函数提取文本:
# Extract the PDF text as a list of lines. NOTE(review): `path` is not defined
# in this snippet; it must be set to the PDF's file path before this line runs.
a_list = convert_pdf_to_txt(path)
这将返回一个列表,然后您就可以遍历列表中的各个元素:
import re
# Scan the extracted lines for entries that start with "08/30/1941:" and print
# the text that follows the date on each such line.
import re

# Compiled once, outside the loop. Group 1 is the date prefix and group 2 is
# the remainder of the line; ``match`` anchors the pattern at the line start.
date_pattern = re.compile(r"(08/30/1941:)(.*\Z)")

try:
    date_seen = False
    for line in a_list:
        hit = date_pattern.match(line)
        if hit is None:
            continue
        date_seen = True
        Date_Found = hit.group(2)
        print(Date_Found)
    if not date_seen:
        # Reported once after the whole list was scanned, not once per
        # non-matching line.
        print("The Date is not available")
except TypeError:
    # Raised when a_list is not iterable or contains non-string elements.
    print("There was some error")