我有一个链接,在浏览器中打开时会直接下载一个文件。我写了一段代码来下载该文件并将其保存为 PDF 文件。代码如下(names.py):
import requests
def download_url(url, dest='D:/file_name.pdf'):
    """Download ``url`` and save the response body to ``dest``.

    Args:
        url: Address fetched with an HTTP GET.
        dest: Path the body is written to. Defaults to the location this
            script has always used.

    Returns:
        The ``requests.Response`` of the download, so callers can inspect
        ``.content``, ``.status_code`` and so on.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url)
    # Fail loudly on an error status (for example an expired signed link)
    # instead of saving the server's error page under a .pdf name.
    r.raise_for_status()
    with open(dest, 'wb') as f:
        f.write(r.content)
    return r
# Fetch the document and save it to disk.
# NOTE(review): the `st=` / `se=` query parameters suggest this is a
# time-limited signed link (it looks like an Azure Blob SAS URL); once it
# has expired the server presumably returns an error body instead of the
# PDF -- confirm before relying on the saved file.
s = download_url('https://ptenantectdtest.blob.core.windows.net/documentcontainer/74c5543d-7ec6-490c-8170-c3355b67f132?sv=2015-12-11&sr=b&sig=BC4RuRGrHkIkxSTnU3g9DTnxr9CfyohtUcV6rOATobo%3D&st=2019-07-17T09%3A04%3A25Z&se=2019-07-17T15%3A14%3A25Z&sp=r&rscd=attachment%3B%20filename%3D%20%223251488b-58b7-44f1-92f4-53ede0891e9a.pdf%22')
# Report the size of the downloaded body in bytes. This only works when
# download_url() returns the response object; if it returns None, this line
# raises AttributeError.
print(len(s.content))
现在,我想将此保存的文件('file_name.pdf')传递给另一个函数:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Numbered-heading prefix such as "3", "3.2" or "3.2.P-1", followed by
# optional whitespace.
_HEADING_PREFIX = r"\d+(?:[.-]\w+)*\s*"

# Trailing plural written either as "s" or literally as "(s)". A bare "(s)"
# in a pattern is a capture group that matches only the letter "s", so the
# literal heading "Manufacturer(s)" would never match without the escape.
_PLURAL = r"(?:\(s\)|s)"

# PDFs whose embedded text is shorter than this are treated as scanned
# documents and sent through OCR instead.
_MIN_TEXT_LAYER_CHARS = 500

# Section titles searched for in the embedded text layer. Alternation is
# ordered: at any position the first title that matches wins, so the order
# of this tuple affects which title is reported.
_TEXT_LAYER_TITLES = (
    "General Information",
    "Process validation",
    "Justification of Specification" + _PLURAL,
    "Manufacturer" + _PLURAL,
    "Batch Formula",
    "Description of Manufacturing Process and Process Controls",
    "Controls of Critical Steps and Intermediates",
    "Process Validation and/or Evaluation",
    "Specification" + _PLURAL,
    "Analytical Procedures",
    "Validation of Analytical Procedures",
    "Batch Analyses",
    "Characterization of Impurities",
    "Reference Standards or Materials",
    "Container Closure System",
    "Pharmaceutical Development",
    # NOTE(review): these two titles were previously fused into one
    # alternative ("...Drug Productal Stability Protocol..."), which looks
    # like a paste error -- confirm the intended titles.
    "Description and Composition of the Drug Product",
    "Post-approval Stability Protocol and Stability Commitment",
)

# Section titles searched for in OCR output (same ordering caveat as above).
_OCR_TITLES = (
    "General Information",
    "Manufacturer" + _PLURAL,
    "Justification of Specification" + _PLURAL,
    "Process Validation",
    "Batch Formula",
    "Description of Manufacturing Process and Process Controls",
    "Container Closure System",
    "Pharmaceutical Development",
    "Description and Composition of the Drug Product",
    "Quality overall summary",
    "Nomenclature",
    "Structure",
    "General properties",
    "Manufacturer",
    "Stability Data",
    "Control of Materials",
    "Controls of Critical Steps and Intermediates",
    # The pattern used to end mid-word here without closing its group, which
    # made it fail to compile; the title is completed and the group closed.
    "Process Validation and/or Evaluation",
)


def _compile_heading_re(titles):
    """Build a case-insensitive regex whose group 1 is the matched title."""
    return re.compile(
        _HEADING_PREFIX + "(" + "|".join(titles) + ")", re.IGNORECASE
    )


_TEXT_LAYER_RE = _compile_heading_re(_TEXT_LAYER_TITLES)
_OCR_RE = _compile_heading_re(_OCR_TITLES)


def convert(fname, pages=None, encoding='utf-8'):
    """Return the first known section title found in a PDF.

    The embedded text layer is extracted with pdfminer. If it holds at least
    500 characters it is searched for a numbered heading followed by one of
    the known titles. Otherwise every page is rendered to JPEG at 300 dpi,
    run through Tesseract OCR, and the recognised text is searched page by
    page.

    NOTE(review): the original indentation of this function was lost; the
    OCR branch is assumed to be the fallback for PDFs with little embedded
    text (not for a failed match on a text PDF) -- confirm.

    Args:
        fname: Path of the PDF file to analyse.
        pages: Optional iterable of page numbers to restrict the pdfminer
            pass to; a falsy value means all pages.
        encoding: Accepted for backward compatibility; not used.

    Returns:
        The matched section title (without its numeric prefix), or ``None``
        when no known title is found.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the file even if pdfminer raises.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()

    if len(text) >= _MIN_TEXT_LAYER_CHARS:
        found = _TEXT_LAYER_RE.search(text)
        return found.group(1) if found else None

    # Little or no embedded text: rasterise the pages and OCR them.
    # On Windows, pytesseract may need pytesseract.pytesseract.tesseract_cmd
    # pointed at the installed tesseract.exe.
    pdffile = wi(filename=fname, resolution=300)
    pdf_img = pdffile.convert('jpeg')
    for img in pdf_img.sequence:
        blob = wi(image=img).make_blob('jpeg')
        page_text = pytesseract.image_to_string(
            Image.open(io.BytesIO(blob)), lang='eng'
        )
        found = _OCR_RE.search(page_text)
        if found:
            # Stop at the first page that yields a title.
            return found.group(1)
    return None
# Run the extraction on the previously saved file. The return value is not
# stored or printed here, so it is only visible in an interactive session.
convert(r'D:\file_name.pdf')
如果我单独把这个文件传给 convert 函数,它可以正常工作。但是,我该如何在第一个代码块(names.py)中调用 convert 函数,让它直接处理刚保存的文件(file_name.pdf)并返回结果?当我这样尝试时,会收到错误:"PDFSyntaxError: No /Root object! - Is this really a PDF?"
我尝试了如下所示的操作,它给出了PDFSyntaxError:
import requests
def download_url(url, dest='D:/file_name.pdf'):
    """Download ``url`` to ``dest`` and return what ``convert`` finds in it.

    Args:
        url: Address fetched with an HTTP GET.
        dest: Path the body is written to and then parsed from. Defaults to
            the location this script has always used.

    Returns:
        Whatever ``convert(dest)`` returns for the saved file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url)
    # Fail loudly on an error status (for example an expired signed link)
    # instead of saving the server's error page and parsing it as a PDF.
    r.raise_for_status()
    with open(dest, 'wb') as f:
        f.write(r.content)
    # Parse only after the with-block has flushed and closed the file;
    # reading it while it is still open for writing can see a truncated
    # file, which a PDF parser rejects as malformed.
    categories = convert(dest)
    return categories
# download_url() in this snippet returns the value produced by convert()
# (not an HTTP response), so print that value directly; it has no
# `.content` attribute.
s = download_url('https://ptenantectdtest.blob.core.windows.net/documentcontainer/74c5543d-7ec6-490c-8170-c3355b67f132?sv=2015-12-11&sr=b&sig=BC4RuRGrHkIkxSTnU3g9DTnxr9CfyohtUcV6rOATobo%3D&st=2019-07-17T09%3A04%3A25Z&se=2019-07-17T15%3A14%3A25Z&sp=r&rscd=attachment%3B%20filename%3D%20%223251488b-58b7-44f1-92f4-53ede0891e9a.pdf%22')
print(s)
有谁能帮我说明一下:如何在上述代码块中调用 convert 函数以获得输出,以及为什么会出现这个 PDFSyntaxError?