我一直在使用 pyocr(tesseract-ocr 和 libtesseract)测试图片中的文字识别功能。我对图像应用了各种 PIL.ImageFilter,并针对图像中的某个特定字符串得到了识别结果。结果并不准确,但我得到了 14 个不同的结果;把它们合在一起看,图像中该字符串的所有正确字母都出现过。于是我枚举了每个字符串,并创建了一个 dict:以字符的位置作为键,其值是另一个 dict,记录在该位置出现过的每个字符(键)及其出现次数(值)。下面是一个简化的例子:
2HG2
#Note: this is not the actual order in which the strings are produced
2HC2
2HC2
2HCZ
2HOZ
2HOZ
2HOZ
2HOZ
2HGZ
2HGZ
2HGZ
ZHGZ
ZHGZ
ZH6Z
ZN6z
字典:
{
0: {
u'2': 10,
u'Z': 4
}, 1: {
u'H': 13,
u'N': 1
}, 2: {
u'C': 3,
u'O': 4,
u'G': 5,
u'6': 2
}, 3: {
u'2': 2,
u'Z': 11,
u'z': 1
}
}
我想按位置尝试所有可能的字符组合,直到得到 2HG2。如能提供任何帮助,将不胜感激。
编辑
我想实现的目标是扫描车辆登记证,从中提取文本,然后用这些数据填写表单。作为概念验证,我正尝试从我本人的车辆登记证中获取 VIN 码。目前,我(很可能比较幼稚地)应用一系列 PIL.ImageFilter,并从每个处理后的图像中获取文本(脚本如下)。
import re
from itertools import permutations
from PIL import Image, ImageFilter
import pyocr
from pyocr import builders
# Every 17-character candidate string collected from the OCR word boxes
# (appended to by get_text).
vins = []
# Maps character position -> {character: number of times it was seen at that
# position}; filled in by count_occurrences.
characters = {}
def validate(vincode):
    """Return True when ``vincode`` is a 17-character VIN with a valid check digit.

    Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number

    Each character is transliterated to a number, multiplied by the weight
    of its position, and the products are summed. The sum modulo 11
    (with 10 written as ``X``) must equal the character at index 8.

    :param vincode: candidate VIN; matching is case-insensitive.
    :returns: ``True`` if the check digit matches. ``False`` for non-text
        input, a length other than 17, the letters I/O/Q, any character
        that has no transliteration value, or a check-digit mismatch.
    """
    maps = "0123456789X"
    weights = [
        8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
    ]
    table = {
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
    }
    # ``unicode`` only exists on Python 2; fall back to ``str`` alone elsewhere
    # so the type check never raises NameError.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)
    if not isinstance(vincode, text_types):
        return False
    if len(vincode) != 17:
        return False
    vincode = vincode.upper()
    if "I" in vincode or "O" in vincode or "Q" in vincode:
        return False
    total = 0
    for value, weight in zip(vincode, weights):
        # A character without a transliteration value can never be part of a
        # VIN. Reject it outright instead of scoring a partial sum, which
        # could accidentally match the check digit.
        if value not in table:
            return False
        total += table[value] * weight
    return maps[total % 11] == vincode[8]
def get_text(tools_, img_):
    """Run each OCR tool on ``img_`` and collect 17-character candidates.

    For every tool except the one named 'Cuneiform (sh)', the image is read
    as word boxes. Each box's content has all runs of non-word characters
    and underscores removed; results that are exactly 17 characters long
    are appended to the module-level ``vins`` list.

    :param tools_: iterable of pyocr tool objects.
    :param img_: image passed to each tool's ``image_to_string``.
    :returns: None; the results accumulate in ``vins``.
    """
    # Raw string keeps the escape valid on every Python version, and the
    # pattern is compiled once rather than once per tool.
    pattern = re.compile(r'[\W_]+')
    for tool in tools_:
        if tool.get_name() == 'Cuneiform (sh)':
            continue
        boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
        # Clean each box once, then filter on the cleaned length.
        cleaned = (pattern.sub('', box.content) for box in boxes)
        vins.extend(text for text in cleaned if len(text) == 17)
def apply_filters_and_get_text(img_, filter_):
    """Filter ``img_`` with ``filter_`` at sizes 1 through 4 and OCR each result.

    A size that makes the filter raise ``ValueError`` is reported and
    skipped. Each successfully filtered image replaces ``img_``, so later
    sizes operate on the already-filtered image. Every result is saved to a
    ``tmp...jpg`` file and handed to ``get_text`` with the module-level
    ``tools``.

    :param img_: image to start from.
    :param filter_: filter class called with the size to build the filter.
    """
    for size in range(1, 5):
        print('Applying {} size: {}'.format(str(filter_), size))
        try:
            filtered = img_.filter(filter_(size))
        except ValueError:
            print('error on {} size: {}'.format(str(filter_), size))
            continue
        img_ = filtered
        img_.save('tmp{}-{}.jpg'.format(str(filter_), size))
        get_text(tools, img_)
def count_occurrences(value):
    """Tally each character of ``value`` by position in ``characters``.

    ``characters[i][c]`` ends up holding how many times character ``c`` has
    been seen at index ``i`` across all calls.

    :param value: string whose characters are counted.
    """
    global characters
    for position, letter in enumerate(value):
        tally = characters.get(position)
        if not isinstance(tally, dict):
            # First character seen at this position: start a fresh tally.
            characters[position] = {letter: 1}
        else:
            tally[letter] = tally.get(letter, 0) + 1
# OCR back ends reported by pyocr.
tools = pyocr.get_available_tools()

# Load the test image, convert it to greyscale, then threshold at 128 into a
# 1-bit black/white image. (Earlier experiments with MaxFilter(5), SHARPEN and
# SMOOTH_MORE, before and after the greyscale step, are disabled.)
img = Image.open('images/test18.jpg').convert('L')
img = img.point(lambda level: 255 if level >= 128 else 0, '1')

# OCR the image under each filter family in turn.
for filter_class in (ImageFilter.MedianFilter, ImageFilter.MinFilter,
                     ImageFilter.MaxFilter, ImageFilter.ModeFilter):
    apply_filters_and_get_text(img, filter_class)

# Build the per-position character tallies from every collected candidate.
for vin in vins:
    count_occurrences(vin)

print(characters)
以上是我的脚本,它从每个经滤镜处理后的图像中获取文本。
{{1}}
答案 0 :(得分:0)
我写出了一个递归函数,它会尝试每一种字母组合,并优先考虑权重(出现次数)较高的字符。
def determine_character(characters_, tried=None):
    """Return the most frequent character in ``characters_`` not yet tried.

    :param characters_: dict mapping a character to its occurrence count.
    :param tried: characters to exclude; ``None`` (the default) excludes
        nothing.
    :returns: the untried character with the highest positive count (the
        first one encountered wins a tie), or ``""`` when every character
        has been tried or ``characters_`` is empty.
    """
    excluded = () if tried is None else tried
    next_character = ""
    current_rank = 0
    for ch, count in characters_.items():
        if count > current_rank and ch not in excluded:
            next_character = ch
            # Remember the best count so far; without this the last untried
            # character would win regardless of its weight.
            current_rank = count
    return next_character
def determine_weight(word):
    """Return the summed occurrence counts of ``word``'s characters.

    For each index ``i`` and character ``c`` in ``word``, adds
    ``characters[i][c]`` from the module-level tally.

    :param word: string to score.
    :returns: the total of the per-position counts.
    """
    global characters
    return sum(characters[position][letter] for position, letter in enumerate(word))
def descramble(word="", index=0):
    """Recursively build candidate words one position at a time.

    At each position every character recorded in ``characters[index]`` is
    tried, in the order ``determine_character`` hands them out. When a
    word covering every position passes ``validate``, ``vin_count`` is
    incremented and the word and its weight are appended to ``valid_vins``.

    :param word: the prefix built so far.
    :param index: the position to fill next.
    :returns: ``{'word': word, 'done': True}`` for a valid full-length word,
        ``False`` for an invalid full-length word, and ``None`` after
        exploring the options of an intermediate position.
    """
    global characters, vin_count, valid_vins
    if index == len(characters):
        if not validate(word):
            return False
        vin_count += 1
        valid_vins.append({'vin': word, 'weight': determine_weight(word)})
        return {'word': word, 'done': True}
    options = characters[index]
    attempted = []
    # One pass per recorded character; each pass picks one not yet attempted.
    for _ in range(len(options)):
        pick = determine_character(options, attempted)
        attempted.append(pick)
        descramble("{word}{ch}".format(word=word, ch=pick), index + 1)