尝试使用 Wand(ImageMagick)提高图像质量,并通过 Pytesseract OCR 获得准确的输出
My code to convert a scanned PDF to a text file:
import io
import os
import re
import sys

import pytesseract
from pdf2image import convert_from_path

try:
    from PIL import Image
except ImportError:
    import Image
# Batch OCR: turn every scanned PDF found in `directory` into a text file.
#
# Each PDF is rasterised page by page with pdf2image, every page image is run
# through Tesseract, and the concatenated text of all pages is written to
# "<directory>/Text/<pdf name without extension>.txt".
directory = r"C:/Users/ahmed.nizamuddin/Desktop/PDF/"
output_directory = os.path.join(directory, "Text")
dpi = 960  # rasterisation resolution handed to pdf2image

# The output folder may not exist yet; open() would fail without it.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(directory):
    # splitext only strips the final extension, so a name such as
    # "a.pdf.scan.pdf" keeps its full stem instead of being cut at the
    # first ".pdf"; the comparison is case-insensitive so ".PDF" is accepted.
    stem, extension = os.path.splitext(filename)
    if extension.lower() != ".pdf":
        continue
    print(filename)

    # os.listdir() yields bare names, so the path must be joined with
    # `directory`; otherwise the file is only found when the current working
    # directory happens to be `directory`.
    pages = convert_from_path(os.path.join(directory, filename), dpi)

    page_texts = []
    for page_number, page in enumerate(pages):
        print("%s-page-%d" % (stem, page_number))
        # The page object is handed to Tesseract directly: no lossy JPEG
        # round trip and no temporary file left behind if OCR raises.
        page_texts.append(pytesseract.image_to_string(page))
    content = "".join(page_texts)

    # Computed outside the page loop so it is defined even when the PDF
    # produced no pages.
    text_to_save = os.path.join(output_directory, stem + ".txt")
    # Append mode is kept from the original script: re-running adds to an
    # existing output file rather than replacing it. UTF-8 is set explicitly
    # so OCR output outside the platform default code page can be written.
    with open(text_to_save, "a", encoding="utf-8") as out:
        out.write(content)
现在我想使用 Wand 包,在进行 Pytesseract OCR 之前先提高图像质量。有人可以告诉我该怎么做吗?