Question

我正在尝试使用 OCR 和 Python 从一组图像中提取数据。这些图像包含表格数据。我面临的问题是无法正确读取图像中深灰色/黑色底色的部分。我尝试了形态学操作、伽玛校正和一些图像变换，但它们似乎都没有帮助。我该怎么做才能读取图像的这些部分？我使用的是 Python 3.7、pytesseract 和 OpenCV（cv2）。我的示例图片如下所示：

这是我的代码-

import cv2
import os 
import numpy as np
import pytesseract
import PIL.Image 
import xlwt
from pdf2jpg import pdf2jpg
import glob 
from os import listdir
from os.path import isfile, join


# Convert every page of the source PDF into JPEG files under output_path.
# Both paths are placeholders and must be filled in before running.
input_path = r".pdf"
output_path = r"..."
result = pdf2jpg.convert_pdf2jpg(
    input_path,
    output_path,
    pages="ALL",
)


# Glob pattern for the folder holding the converted page images (placeholder).
path = "...\\*"

# Collect every entry matching the pattern; despite its name, the list is
# filled right away.
empty_list = []
empty_list.extend(glob.glob(path))
print(empty_list)   # prints the path of every image in the folder


def change_path(image_path):
    """Return *image_path* with every backslash doubled.

    The type of the argument and the rewritten path are printed as a
    debugging aid.

    Args:
        image_path: Path string to rewrite.

    Returns:
        A copy of ``image_path`` in which each single backslash has been
        replaced by two backslashes.
    """
    # Work on the argument, not on the module-level glob pattern ``path``;
    # otherwise every call yields the same string regardless of the input.
    print(type(image_path))
    new_path = image_path.replace("\\", "\\\\")
    print(new_path)
    return new_path

def read_and_display(window_name, image_name):
    """Load an image from disk, show it in a window and wait for a key press.

    Args:
        window_name: Title of the OpenCV window used for display.
        image_name: Path of the image file to load.

    Returns:
        The image object returned by ``cv2.imread`` for ``image_name``.
    """
    ori_img = cv2.imread(image_name)
    print(image_name)
    cv2.imshow(window_name, ori_img)
    cv2.waitKey(0)  # block until a key is pressed
    # cv2.imshow has no meaningful return value (it yields None), so return
    # the image that was displayed instead.
    return ori_img

def resize_the_image(image_name):
    """Read the image at ``image_name`` and return it resized to 1500x1300.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The resized image produced by ``cv2.resize`` with ``INTER_AREA``
        interpolation.
    """
    target_size = (1500, 1300)  # passed to cv2.resize as dsize
    source = cv2.imread(image_name)
    return cv2.resize(source, target_size, interpolation=cv2.INTER_AREA)

def gray_the_image(image_name):
    """Read an image, convert it to grayscale and binarise it.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The thresholded image: grayscale conversion followed by
        ``THRESH_BINARY | THRESH_OTSU`` with a maximum value of 255.
    """
    bgr = cv2.imread(image_name)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is kept.
    _, binary = cv2.threshold(gray, 0, 255, otsu_flags)
    return binary

def dilate_the_image(image_name):
    """Read an image and return it dilated with a 7x7 all-ones kernel.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The result of ``cv2.dilate`` on the loaded image.
    """
    kernel = np.ones((7, 7), np.uint8)
    source = cv2.imread(image_name)
    return cv2.dilate(source, kernel)

def bg_the_image(image_name):
    """Read an image and return its median-blurred version (aperture 21).

    Args:
        image_name: Path of the image file to load.

    Returns:
        The result of ``cv2.medianBlur`` on the loaded image.
    """
    blur_aperture = 21
    source = cv2.imread(image_name)
    return cv2.medianBlur(source, blur_aperture)


def difference_the_image(image_name):
    """Read an image and return the inverted difference from its blurred copy.

    Args:
        image_name: Path of the image file to load.

    Returns:
        ``255 - absdiff(image, medianBlur(image, 21))``.
    """
    source = cv2.imread(image_name)
    background = cv2.medianBlur(source, 21)
    deviation = cv2.absdiff(source, background)
    return 255 - deviation


def normalize_the_image(image_name):
    """Read an image, remove its blurred background and min-max normalise it.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The inverted background difference rescaled to the 0..255 range
        with ``cv2.NORM_MINMAX``.
    """
    source = cv2.imread(image_name)
    background = cv2.medianBlur(source, 21)
    inverted_diff = 255 - cv2.absdiff(source, background)
    # A pre-allocated destination is passed explicitly (needed for OpenCV 3.x
    # compatibility).
    destination = inverted_diff.copy()
    return cv2.normalize(
        inverted_diff,
        destination,
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )


def threshold_the_image(image_name):
    """Read an image, normalise it, truncate bright values and re-normalise.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The normalised background difference after ``THRESH_TRUNC`` at 230
        and a second min-max normalisation to 0..255.
    """
    source = cv2.imread(image_name)
    background = cv2.medianBlur(source, 21)
    inverted_diff = 255 - cv2.absdiff(source, background)

    # A pre-allocated destination is passed explicitly (needed for OpenCV 3.x
    # compatibility).
    stretched = cv2.normalize(
        inverted_diff,
        inverted_diff.copy(),
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )

    # cv2.threshold returns (threshold_used, image); only the image is kept.
    truncated = cv2.threshold(stretched, 230, 0, cv2.THRESH_TRUNC)[1]
    return cv2.normalize(
        truncated,
        truncated,
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )


def adjust_gamma(image_name, gamma=1.0):
    """Apply gamma correction through a 256-entry lookup table.

    Args:
        image_name: Despite the name, this is the image data handed to
            ``cv2.LUT``, not a file path.
        gamma: Gamma value; the table uses the exponent ``1.0 / gamma``.

    Returns:
        The result of ``cv2.LUT`` applied to ``image_name``.
    """
    exponent = 1.0 / gamma
    levels = []
    for level in np.arange(0, 256):
        # Map the level to 0..1, apply the power curve, scale back to 0..255.
        levels.append(((level / 255.0) ** exponent) * 255)
    lookup = np.array(levels).astype("uint8")
    return cv2.LUT(image_name, lookup)

def adjust_the_image(image_name):
    """Run the full preprocessing chain on an image file and gamma-correct it.

    The chain is: median-blur background removal, min-max normalisation,
    truncation at 230, a second normalisation, then gamma correction. The
    gamma value used is drawn onto the returned image as a text label.

    Args:
        image_name: Path of the image file to load.

    Returns:
        The gamma-corrected image with the ``g=<gamma>`` label drawn on it.
    """
    gamma = 0.5  # change the value here to get a different result

    source = cv2.imread(image_name)
    background = cv2.medianBlur(source, 21)
    inverted_diff = 255 - cv2.absdiff(source, background)

    # A pre-allocated destination is passed explicitly (needed for OpenCV 3.x
    # compatibility).
    stretched = cv2.normalize(
        inverted_diff,
        inverted_diff.copy(),
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )

    truncated = cv2.threshold(stretched, 230, 0, cv2.THRESH_TRUNC)[1]
    truncated = cv2.normalize(
        truncated,
        truncated,
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )

    corrected = adjust_gamma(truncated, gamma=gamma)
    label = "g={}".format(gamma)
    return cv2.putText(
        corrected,
        label,
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.8,
        (0, 0, 255),
        3,
    )


def tesseract_call(image_path):
    """Run Tesseract OCR on an image file and append the text to ``f.txt``.

    Args:
        image_path: Path of the image file to recognise. The file is opened
            directly with PIL, so it is the on-disk image that gets OCR'd.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
    # Page segmentation mode 6. The double-dash spelling is required by
    # Tesseract 4 and later; the legacy single-dash "-psm" is rejected there.
    config = "--psm 6"
    # Context managers close the image and the output file even on errors.
    with PIL.Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, config=config)
    with open("f.txt", "a", encoding="utf-8") as file:
        file.write(text)
    return text



# Driver: show each image, run every preprocessing stage on it, then OCR it.
# NOTE(review): every stage re-reads the file from disk and its return value
# is discarded, and tesseract_call() opens the same path again, so the OCR
# step sees the unmodified file rather than any preprocessed image.
for image_file in empty_list:
    redefinedpath = change_path(image_file)
    print(redefinedpath)
    read_and_display("newWindow", image_file)
    for stage in (
        resize_the_image,
        gray_the_image,
        dilate_the_image,
        bg_the_image,
        difference_the_image,
        normalize_the_image,
        threshold_the_image,
        adjust_the_image,
    ):
        stage(image_file)
    tesseract_call(image_file)

尝试了各种“gamma”值，但都无济于事。

无法正确读取Python中具有深灰色/黑色背景的文本

0 个答案: