我有一个将pdf转换为excel的程序,现在我想添加多个输入,即多个pdf一张一张地转换。
我的代码如下:
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os
pdf = wi(filename= "pdfs/jaalna.pdf", resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open('aama.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.
答案 0 :(得分:0)
您可以使用循环
for name in ["pdfs/jaalna.pdf", "other/file.pdf"]:
pdf = wi(filename=name, resolution=300)
# rest of code
或者您可以使用sys.argv
来获取名称
script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf
和代码
import sys
for name in sys.argv[1:]:
pdf = wi(filename=name, resolution=300)
# rest of code
答案 1 :(得分:0)
尝试下面的代码。这将遍历您定义的文件夹目录中的每个PDF文件。请确保将file_path更新为保存PDF的位置,并确保使用双反斜杠代替单个反斜杠。
from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os
file_path = "C:\\Users\\..."
for file in os.listdir(file_path):
if file.endswith(".pdf"):
pdf = wi(file, resolution =300)
pdfImage = pdf.convert("jpg")
imageBlobs = []
for img in pdfImage.sequence:
imgPage = wi(image = img)
#img.filter(ImageFilter.EDGE_ENHANCE_MORE )
imageBlobs.append(imgPage.make_blob('jpg'))
recognized_text = []
for imgBlob in imageBlobs:
im = Image.open(io.BytesIO(imgBlob))
text = pytesseract.image_to_string(im, lang = 'eng1+mar1')
recognized_text.append(text)
newfile = open(file+'.txt','w')
newfile.write(",".join(recognized_text))
#add a folder as input.