使用Tesseract进行图像到文本转换

时间:2017-10-31 19:50:50

标签: python tesseract

我正在尝试加载文件夹中的所有图像并从图像中提取文本。我一直收到第二个for循环的错误消息。例如,

  

AttributeError:'numpy.ndarray'对象没有属性'read'

似乎我无法访问列表 img。有什么想法吗?

# import OpenCV, Numpy, Python image library, Tesseract OCR
import os
import cv2
import numpy 
from PIL import Image
import pytesseract
import glob

# Point pytesseract at the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'

# Read every .jpg image in the given folder into a list
img = []    

for i in glob.glob("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\\Reports\\Image\\*.jpg"):
    n= cv2.imread(i,0)   # flag 0 loads the image as grayscale; n is a numpy.ndarray, not a path
    print(i)
    img.append(n)


for j in img:
    # NOTE(review): j is the numpy.ndarray produced by cv2.imread above, not a
    # file name or file object, so this Image.open call is most likely what
    # raises the reported "'numpy.ndarray' object has no attribute 'read'".
    im = Image.open(j)
    text = pytesseract.image_to_string (j, lang='eng')
    # NOTE(review): mode "w" truncates test.txt on every iteration, so only the
    # text of the last image would remain in the file.
    with open("C:\\Users\\daizhang\\Desktop\\Deloitte Development\\Python\Reports\\Image\\test.txt", "w") as f:
    # NOTE(review): the next line is not indented under the `with` statement,
    # which is an IndentationError as written.
    f.write(text.encode('utf8'))

1 个答案:

答案 0 :(得分:1)

我用的是 Mac OS X,但您可以调整此代码中的文件路径,以适配 Windows 的目录。

import os
from os import path
from glob import glob 
from pytesseract import image_to_string
from PIL import Image, ImageEnhance, ImageFilter

def enhance_img(filename, newfilename=None):
    """Enhance an image for OCR and save the result under a new name.

    The image is median-filtered, its contrast is doubled, and it is
    converted to 1-bit black and white before being saved.

    Args:
        filename: Path of the image to enhance.
        newfilename: Path to save the enhanced image to. Defaults to the
            original path with its extension replaced by ``_1.png``.

    Returns:
        The path the enhanced image was saved to.
    """
    if newfilename is None:
        # Derive the output name from the input so that enhancing several
        # files does not overwrite one shared output file.
        newfilename = path.splitext(filename)[0] + '_1.png'

    # The image has to be opened before it can be filtered.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')
    im.save(newfilename)
    return newfilename

def convert_img(filename):
    """Run OCR on an image and append the recognized text to parsing.txt.

    Args:
        filename: Path of the image to convert to text.
    """
    # Close the image as soon as the text has been extracted.
    with Image.open(filename) as image:
        text = image_to_string(image)

    # Append mode, so text from successive images accumulates in one file;
    # the with-block guarantees the file is flushed and closed.
    with open('parsing.txt', 'a') as out:
        out.write(text)

def find_ext(dir, ext):
    """Return the paths directly inside *dir* whose names end in ``.<ext>``."""
    pattern = path.join(dir, "*." + format(ext))
    return glob(pattern)

# Collect every .png file in the current working directory.
# To scan a different directory, call os.chdir(<directory>) beforehand.
filename = find_ext("", "png")

# Convert each image found to text
for file in filename:
    convert_img(file)

如果要增强图像,请加入以下代码块,并调整上面的代码,使其遍历新的文件名。

def enhance_img(filename, newfilename=None):
    """Enhance an image for OCR and save the result under a new name.

    The image is median-filtered, its contrast is doubled, and it is
    converted to 1-bit black and white before being saved.

    Args:
        filename: Path of the image to enhance.
        newfilename: Path to save the enhanced image to. Defaults to the
            original path with its extension replaced by ``_1.png``.

    Returns:
        The path the enhanced image was saved to.
    """
    if newfilename is None:
        # Derive the output name from the input so that enhancing several
        # files does not overwrite one shared output file.
        newfilename = path.splitext(filename)[0] + '_1.png'

    # The image has to be opened before it can be filtered.
    with Image.open(filename) as source:
        im = source.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(im)
    im = enhancer.enhance(2)
    im = im.convert('1')
    im.save(newfilename)
    return newfilename

for file in filename:
    # Enhance the image if needed.
    # Intended name for the enhanced copy: the current file's path with its
    # extension replaced by '_1.png' (filename is the whole list, so it must
    # not be indexed here).
    newfilename = path.splitext(file)[0] + '_1.png'
    enhance_img(file)