我正在尝试构建一个将链接作为参数的api,如下所示
https://ptenantectdtest.blob.core.windows.net/documentcontainer/2e99e137-4958-426b-8f33-6246de7ec721?sv=2015-12-11&sr=b&sig=2kVS0XS56K18co1LDpt703O9d%2FZzTywFGgPTdzlz28c%3D&st=2019-07-17T11%3A29%3A25Z&se=2019-07-17T17%3A39%3A25Z&sp=r&rscd=attachment%3B%20filename%3D%20%228d4508bf-453e-45fd-8457-8fd158152ba7.pdf%22
下载文件并将文件名发送到另一个对文件进行分类的函数(转换)。因此,我的api应该能够将此可下载链接作为参数并处理文件。
我到目前为止编写的api是:
import ectd
from ectd import convert
from flask import Flask, request
from flask_restful import Resource, Api
import os
import requests
app = Flask(__name__)
api = Api(app)
class ectdtext(Resource):
def get(self, result):
return {'data': ectd.convert(result)}
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def get_dir(path):
r = requests.get(path)
with open('D:/file_namexx.pdf', 'wb') as f:
f.write(r.content)
categories = convert('D:/file_namexx.pdf')
return categories
if __name__ == '__main__':
app.run()
这给我一个pdf语法错误
我使用convert函数的ectd文件如下:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
def convert(fname, pages=None,encoding='utf-8'):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
if len(text)>=500:
regex3=re.search(r"\d+(?:[.-]\w+)*\s*(General Information|Process validation|Justification of Specification(s)|Post Approval Stability Protocol and Stability Commitment)",text,re.IGNORECASE)
if regex3:
return regex3.group(1)
else:
pdffile = wi(filename = fname, resolution = 300)
pdfImg = pdffile.convert('jpeg')
imgBlobs = []
for img in pdfImg.sequence:
page = wi(image = img)
imgBlobs.append(page.make_blob('jpeg'))
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# extracted_text = []
for imgBlob in imgBlobs:
im= Image.open(io.BytesIO(imgBlob))
text2 = pytesseract.image_to_string(im, lang = 'eng')
regex4=re.search(r"\d+(?:[.-]\w+)*\s*(General Information|Manufacturer(s)|Justification of Specification(s)|Process Validation|Batch Formula|Description of Manufacturing Process and Process Controls|Controls of Critical Steps and Intermediates|Process Validation and/or Evaluation|Specification(s))",text2,re.IGNORECASE)
if regex4:
return regex4.group(1)
关于我做错了什么以及我应该在api路由中解决的任何建议。问题是我认为是从保存的pdf中读取的。如何解决此pdf语法错误。是因为模式还是编码类型