Question

我有一个将PDF转换为Excel的程序，现在我想让它支持多个输入，即把多个PDF文件逐个进行转换。

我的代码如下：

from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import os
import cv2
import pandas as pd
import re
import numpy as np
import os

# Load the PDF with resolution=300 and convert its pages to JPEG.
pdf = wi(filename="pdfs/jaalna.pdf", resolution=300)
pdfImage = pdf.convert("jpg")

# Turn every page of the converted document into an in-memory JPEG blob.
imageBlobs = []
for img in pdfImage.sequence:
    imgPage = wi(image=img)
    #img.filter(ImageFilter.EDGE_ENHANCE_MORE )
    imageBlobs.append(imgPage.make_blob('jpg'))

# Initialise the result list once, outside the page loop, so it also exists
# when the document has no pages.
recognized_text = []
for imgBlob in imageBlobs:
    im = Image.open(io.BytesIO(imgBlob))
    text = pytesseract.image_to_string(im, lang='eng1+mar1')
    recognized_text.append(text)

# Write the per-page texts joined by commas. The context manager guarantees
# the file is flushed and closed; UTF-8 is requested explicitly because the
# OCR output may contain non-ASCII characters (presumably Marathi, given the
# 'mar1' language data) that the platform default encoding cannot represent.
with open('aama.txt', 'w', encoding='utf-8') as newfile:
    newfile.write(",".join(recognized_text))

#add a folder as input.

Answer 1

您可以使用循环

# PDF paths to convert, handled one after another in the listed order.
pdf_paths = ["pdfs/jaalna.pdf", "other/file.pdf"]
for name in pdf_paths:
    pdf = wi(filename=name, resolution=300)
    # rest of code

或者您可以使用sys.argv从命令行参数中获取文件名

script.py pdfs/jaalna.pdf other/file.pdf other/third.pdf

对应的代码如下：

import sys

# Each command-line argument after the script name is used as a PDF path.
pdf_args = sys.argv[1:]
for name in pdf_args:
    pdf = wi(filename=name, resolution=300)
    # rest of code

Answer 2

尝试下面的代码。这将遍历您定义的文件夹目录中的每个PDF文件。请确保将file_path更新为保存PDF的位置，并确保使用双反斜杠代替单个反斜杠。

from PIL import Image
import io
import pytesseract
from wand.image import Image as wi
import cv2
import pandas as pd
import re
import numpy as np
import os

file_path = "C:\\Users\\..."

# OCR every PDF located directly inside file_path and write one text file per
# PDF (named "<pdf name>.txt") into the current working directory.
for file in os.listdir(file_path):
    # Compare case-insensitively so files such as "SCAN.PDF" are picked up too.
    if file.lower().endswith(".pdf"):
        # os.listdir() yields bare file names, so the directory has to be
        # joined back on. The path must be passed as filename= because the
        # first positional parameter of wand's Image is an image object,
        # not a path.
        pdf = wi(filename=os.path.join(file_path, file), resolution=300)
        pdfImage = pdf.convert("jpg")

        # Turn every page of the converted document into a JPEG blob.
        imageBlobs = []
        for img in pdfImage.sequence:
            imgPage = wi(image=img)
            #img.filter(ImageFilter.EDGE_ENHANCE_MORE )
            imageBlobs.append(imgPage.make_blob('jpg'))

        # Initialise once per PDF, outside the page loop, so the list also
        # exists when a document has no pages.
        recognized_text = []
        for imgBlob in imageBlobs:
            im = Image.open(io.BytesIO(imgBlob))
            text = pytesseract.image_to_string(im, lang='eng1+mar1')
            recognized_text.append(text)

        # The context manager closes the file before the next PDF is
        # processed; UTF-8 is requested explicitly because the OCR output may
        # contain non-ASCII characters (presumably Marathi, given 'mar1').
        with open(file + '.txt', 'w', encoding='utf-8') as newfile:
            newfile.write(",".join(recognized_text))

        #add a folder as input.

如何添加多个PDF转换为Excel？

2 个答案: