Question

我一直在使用pyocr（tesseract-ocr和libtesseract）测试图片中的文字识别功能。我对图像应用了各种PIL.ImageFilter，并针对图像中的某个特定字符串得到了多个识别结果。单个结果都不准确，但我一共有14个不同的结果，把它们合在一起看，图像中字符串的每个正确字母都出现过。所以我枚举了每个结果字符串，并创建了一个dict：以字符的位置作为键，每个键对应的值又是一个dict，记录在该位置出现过的每个字符及其出现的次数。这是一个缩短的例子

String In Image：

2HG2

结果：

#Note: this is not the actual order in which the strings are produced
2HC2
2HC2
2HCZ
2HOZ
2HOZ
2HOZ
2HOZ
2HGZ
2HGZ
2HGZ
ZHGZ
ZHGZ
ZH6Z
ZN6z

字典：

{
    0: {
        u'2': 10, 
        u'Z': 4
    }, 1: {
        u'H': 13, 
        u'N': 1
    }, 2: {
        u'C': 3, 
        u'O': 4, 
        u'G': 5, 
        u'6': 2
    }, 3: {
        u'2': 2, 
        u'Z': 11, 
        u'z': 1
    }
}

我想尝试每个位置的每个字母组合，直到我得到2HG2。任何帮助将不胜感激。

修改我试图实现的目标是扫描汽车注册，从中获取文本，然后使用数据填充表单。作为概念证明，我试图从我的人员注册中获取VIN号码。目前，我（很可能天真地）应用一系列PIL.ImageFilter并从每个import re from itertools import permutations from PIL import Image, ImageFilter import pyocr from pyocr import builders vins = [] characters = {} def validate(vincode): """ Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number """ maps = "0123456789X" weights = [ 8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2 ] table = { "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9, "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9, } if not isinstance(vincode, str) and not isinstance(vincode, unicode): return False if len(vincode) != 17: return False vincode = vincode.upper() if "I" in vincode or "O" in vincode or "Q" in vincode: return False total = 0 for index, value in enumerate(vincode): try: products = table[value] * weights[index] except KeyError: break total += products index = total % 11 return maps[index] == vincode[8] def get_text(tools_, img_): for tool in tools_: if tool.get_name() == 'Cuneiform (sh)': continue # print '=======================\nUsing {}\n======================='.format(tool.get_name()) boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder()) global vins pattern = re.compile('[\W_]+') vins += [pattern.sub('', x.content) for x in boxes if len(pattern.sub('', x.content)) == 17] # boxes = [x for x in boxes if len(x.content.strip()) != 0] # print boxes[3].content # for box in boxes: # print box.content def apply_filters_and_get_text(img_, filter_): for x in range(1, 5): print 'Applying {} size: {}'.format(str(filter_), x) try: img_ = img_.filter(filter_(x)) except ValueError: print 'error on {} size: {}'.format(str(filter_), x) continue img_.save('tmp{}-{}.jpg'.format(str(filter_), x)) get_text(tools, img_) def count_occurrences(value): 
global characters for index, c in enumerate(value): if index in characters and c in characters[index]: characters[index][c] += 1 continue if index in characters and isinstance(characters[index], dict): characters[index][c] = 1 continue characters[index] = {c: 1} tools = pyocr.get_available_tools() img = Image.open('images/test18.jpg') # get_text(tools) # img = img.filter(ImageFilter.MaxFilter(5)) # img = img.filter(ImageFilter.SHARPEN) # img = img.filter(ImageFilter.SMOOTH_MORE) # get_text(tools) # get_text(tools) img = img.convert('L') # get_text(tools) # img = img.filter(ImageFilter.MaxFilter(5)) # img = img.filter(ImageFilter.SHARPEN) # img = img.filter(ImageFilter.SMOOTH_MORE) # get_text(tools) # get_text(tools) img = img.point(lambda x: 0 if x < 128 else 255, '1') apply_filters_and_get_text(img, ImageFilter.MedianFilter) apply_filters_and_get_text(img, ImageFilter.MinFilter) apply_filters_and_get_text(img, ImageFilter.MaxFilter) apply_filters_and_get_text(img, ImageFilter.ModeFilter) for vin in vins: count_occurrences(vin) # print vin # print validate(vin) print characters获取文本。以下是我的剧本。

{{1}}

Answer 1

我写出了一个递归函数，它会尝试每个位置上字母的所有组合，并优先考虑权重（出现次数）较高的字符。

def determine_character(characters_, tried=None):
    """Return the highest-weighted character that has not been tried yet.

    Args:
        characters_: dict mapping a candidate character to its weight
            (the number of times it was seen at one position).
        tried: optional collection of characters to exclude. Defaults to
            no exclusions; ``None`` is used instead of a mutable ``[]``
            default so no list is shared between calls.

    Returns:
        The untried character with the largest weight greater than zero.
        On a tie the character encountered first while iterating
        ``characters_`` wins. Returns ``""`` when no candidate qualifies.
    """
    excluded = () if tried is None else tried
    next_character = ""
    current_rank = 0
    for ch, rank in characters_.items():
        if rank > current_rank and ch not in excluded:
            next_character = ch
            # Remember the best weight so far; without this update the
            # loop simply keeps the last untried character it visits.
            current_rank = rank
    return next_character


def determine_weight(word):
    """Return the total weight of *word*.

    For every position in *word*, looks up the count stored for that
    character in the module-level ``characters`` table and adds the counts
    together. Raises ``KeyError`` if a position or character is missing
    from the table.
    """
    global characters
    return sum(
        characters[position][letter]
        for position, letter in enumerate(word)
    )


def descramble(word="", index=0):
    """Recursively build every candidate string from ``characters``.

    Starting from *word* (the prefix built so far) and *index* (the next
    position to fill), appends each candidate character recorded for that
    position and recurses. When a full-length word passes ``validate`` it
    is counted in ``vin_count`` and appended to ``valid_vins`` together
    with its weight.

    Returns:
        ``{'word': word, 'done': True}`` for a full-length word that
        validates, ``False`` for a full-length word that does not, and
        ``None`` for every partial prefix (the recursive results are not
        propagated).
    """
    global characters, vin_count, valid_vins

    # Terminal depth: one character has been chosen for every position.
    if index == len(characters):
        if not validate(word):
            return False
        vin_count += 1
        valid_vins.append({'vin': word, 'weight': determine_weight(word)})
        return {'word': word, 'done': True}

    attempted = []
    # Keep asking for the next untried character until each candidate at
    # this position has been used exactly once.
    while len(attempted) < len(characters[index]):
        pick = determine_character(characters[index], attempted)
        attempted.append(pick)
        descramble("{word}{ch}".format(word=word, ch=pick), index + 1)

尝试tesseract文本结果中每个加权组合的字母

String In Image：

结果：

1 个答案: