I'm new to ML and have adapted the code from https://github.com/DeepSystems/supervisely-tutorials/blob/master/anpr_ocr/src/image_ocr.ipynb to read MICR text.
My network always converges to outputting nothing at all, or only one or two characters. I want it to output roughly 20 characters approximating the MICR text on the images I feed it. The exception is when it is trained on a single image; in that case it gets close to the right answer, but usually outputs only about half of the expected string.
I've tried tuning hyperparameters and tweaking the model architecture (adding/removing convolutional and pooling layers). These affect how quickly it "learns", but it still converges to outputting nothing. I'm probably missing something obvious, but I don't know what it is. I also wonder whether this architecture is unsuitable for images where the text isn't front and center.
If you want to run the code yourself, I've posted 500 training images here: https://drive.google.com/open?id=1qqr64Oyh_ghzuKoqoDBzkx3zKkhIuFPv
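For reference, the labels are taken from the filenames (the first two '-'-separated fields, concatenated), as in the generator below. A minimal sanity check of that convention, using a hypothetical filename:

def label_from_filename(filename):
    # the first two dash-separated fields form the label; the rest is ignored
    first, second = filename.split('-')[:2]
    return first + second

assert label_from_filename('A123456789A-0000123456C-0.png') == 'A123456789A0000123456C'

The full script follows: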
import keras
import tensorflow as tf
print('TensorFlow version:', tf.__version__)
print('Keras version:', keras.__version__)
import os
import random
import itertools
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Input, Dense, Activation
from keras.layers import Reshape, Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model, load_model
from keras.layers.recurrent import GRU
from keras.optimizers import SGD
import keras.callbacks
import cv2
sess = tf.Session()
K.set_session(sess)
# MICR special characters (index in `letters`):
# A = account symbol (10)
# B = amount symbol (11)
# C = routing symbol (12)
# D = dash symbol (13)
letters = '0123456789ABCD!'  # '!' (14) is the blank/padding character
img_height = 360
img_width = 540
max_label_len = 30
# Network parameters
conv_filters = 32 # default was 16
kernel_size = (3, 3) # (3, 3)
pool_size = 2 # 2
time_dense_size = 8 # 32
rnn_size = 128 # 512
downsample_factor = pool_size ** 2 # two 2x2 max-pools => width shrinks 4x
batch_size = 32 # 32
epochs_count = 20
def labels_to_text(labels):
    # map indices back to characters; -1 padding maps to letters[-1] ('!')
    return ''.join(list(map(lambda x: letters[int(x)], labels)))
def text_to_labels(text):
    # encode text as indices into `letters`, padded to max_label_len with -1
    label = np.ones(max_label_len) * -1
    for x in range(0, len(text)):
        label[x] = letters.index(text[x])
    return label
def is_valid_str(s):
for ch in s:
if not ch in letters:
return False
return True
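# Quick round-trip check of the helpers above (hypothetical example string).
# Note: text_to_labels pads with -1, and labels_to_text maps -1 back to
# letters[-1] ('!') via negative indexing, so padded labels decode with
# trailing '!' characters.
example = '0123456789ABCD'
assert is_valid_str(example)
assert labels_to_text(text_to_labels(example)[:len(example)]) == example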
class TextImageGenerator:
def __init__(self,
tag,
img_w, img_h,
batch_size,
downsample_factor,
max_text_len=max_label_len):
self.tag=tag
self.img_h = img_h
self.img_w = img_w
self.batch_size = batch_size
self.max_text_len = max_text_len
self.downsample_factor = downsample_factor
self.samples = []
        self.images_dir = 'train-one'  # NOTE: the same directory is used for every tag ('train'/'val'/'test')
        for filename in os.listdir(self.images_dir):
            # the label is encoded in the filename as the first two '-'-separated fields
            filename_split = filename.split('-')[:2]
            name = filename_split[0] + filename_split[1]
            self.samples.append([self.images_dir + '/' + filename, name])
        self.n = len(self.samples)
self.indexes = list(range(self.n))
self.cur_index = 0
def build_data(self):
self.imgs = np.zeros((self.n, self.img_h, self.img_w))
self.texts = []
for i, (img_filepath, text) in enumerate(self.samples):
            img = cv2.imread(img_filepath, cv2.IMREAD_GRAYSCALE)  # assumes every image is already img_h x img_w
            img = img.astype(np.float32)
            img /= 255
# width and height are backwards from typical Keras convention
# because width is the time dimension when it gets fed into the RNN
self.imgs[i, :, :] = img
self.texts.append(text)
    def get_output_size(self):
        return len(letters) + 1  # +1 for the CTC blank class
def next_sample(self):
self.cur_index += 1
if self.cur_index >= self.n:
self.cur_index = 0
random.shuffle(self.indexes)
return self.imgs[self.indexes[self.cur_index]], self.texts[self.indexes[self.cur_index]]
def next_batch(self):
while True:
# width and height are backwards from typical Keras convention
# because width is the time dimension when it gets fed into the RNN
if K.image_data_format() == 'channels_first':
X_data = np.ones([self.batch_size, 1, self.img_w, self.img_h])
else:
X_data = np.ones([self.batch_size, self.img_w, self.img_h, 1])
Y_data = np.ones([self.batch_size, self.max_text_len])
# input_length (width of image that is fed to the loss function): 133 == 540 / 4 - 2
# is that a problem?
input_length = np.ones((self.batch_size, 1)) * (self.img_w // self.downsample_factor - 2)
label_length = np.zeros((self.batch_size, 1))
source_str = []
for i in range(self.batch_size):
img, text = self.next_sample()
img = img.T
if K.image_data_format() == 'channels_first':
img = np.expand_dims(img, 0)
else:
img = np.expand_dims(img, -1)
X_data[i] = img
Y_data[i] = text_to_labels(text)
source_str.append(text)
label_length[i] = len(text)
inputs = {
'the_input': X_data,
'the_labels': Y_data,
'input_length': input_length,
'label_length': label_length,
#'source_str': source_str
}
outputs = {'ctc': np.zeros([self.batch_size])}
yield (inputs, outputs)
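# The generator yields (inputs, outputs): the real targets ('the_labels',
# 'input_length', 'label_length') travel in `inputs`, while outputs['ctc'] is
# a dummy, since the CTC loss is computed inside the model's Lambda layer.
# Below: pull one batch and visualize what gets fed to the network.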
tiger = TextImageGenerator('val', 540, img_height, 8, 4)
tiger.build_data()
for inp, out in tiger.next_batch():
    print('Text generator output (data which will be fed into the neural network):')
print('1) the_input (image)')
if K.image_data_format() == 'channels_first':
img = inp['the_input'][0, 0, :, :]
else:
img = inp['the_input'][0, :, :, 0]
plt.imshow(img.T, cmap='gray')
plt.show()
    print('2) the_labels (MICR text): %s is encoded as %s' %
          (labels_to_text(inp['the_labels'][0]), list(map(int, inp['the_labels'][0]))))
    print('3) input_length (width of image that is fed to the loss function): %d == %d / 4 - 2' %
          (inp['input_length'][0], tiger.img_w))
    print('4) label_length (length of MICR text): %d' % inp['label_length'][0])
break
def ctc_lambda_func(args):
y_pred, labels, input_length, label_length = args
# the 2 is critical here since the first couple outputs of the RNN
# tend to be garbage:
y_pred = y_pred[:, 2:, :]
return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
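# Note: the two timesteps sliced off above correspond to the "- 2" in the
# input_length computed by next_batch(); if either changes, the other must
# change with it, or K.ctc_batch_cost will be told y_pred is longer than it is.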
def train(img_w, load=False):
# Input Parameters
img_h = img_height
if K.image_data_format() == 'channels_first':
input_shape = (1, img_w, img_h)
else:
input_shape = (img_w, img_h, 1)
tiger_train = TextImageGenerator('train', img_w, img_h, batch_size, downsample_factor)
tiger_train.build_data()
tiger_val = TextImageGenerator('val', img_w, img_h, batch_size, downsample_factor)
tiger_val.build_data()
act = 'relu'
input_data = Input(name='the_input', shape=input_shape, dtype='float32')
inner = Conv2D(conv_filters, kernel_size, padding='same',
activation=act, kernel_initializer='he_normal',
name='conv1')(input_data)
inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
inner = Conv2D(conv_filters, kernel_size, padding='same',
activation=act, kernel_initializer='he_normal',
name='conv2')(inner)
inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)
conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)
# cuts down input size going into RNN:
inner = Dense(time_dense_size, activation=act, name='dense1')(inner)
    # Two layers of bidirectional GRUs
# GRU seems to work as well, if not better than LSTM:
gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
gru1_merged = add([gru_1, gru_1b])
gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
# transforms RNN output to character activations:
inner = Dense(tiger_train.get_output_size(), kernel_initializer='he_normal',
name='dense2')(concatenate([gru_2, gru_2b]))
y_pred = Activation('softmax', name='softmax')(inner)
Model(inputs=input_data, outputs=y_pred).summary()
labels = Input(name='the_labels', shape=[tiger_train.max_text_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
# Keras doesn't currently support loss funcs with extra parameters
# so CTC loss is implemented in a lambda layer
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
    # clipnorm seems to speed up convergence
sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
if load:
model = load_model('./tmp_model.h5', compile=False)
else:
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
# the loss calc occurs elsewhere, so use a dummy lambda func for the loss
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)
if not load:
# captures output of softmax so we can decode the output during visualization
test_func = K.function([input_data], [y_pred])
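        # (test_func is currently unused here; in the Keras image_ocr example
        # this code descends from, it drives a visualization callback that
        # decodes predictions each epoch -- something similar could help debug.)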
# converges to about 51 loss by epoch 6
model.fit_generator(generator=tiger_train.next_batch(),
steps_per_epoch=tiger_train.n // batch_size,
#steps_per_epoch=1,
epochs=epochs_count,
validation_data=tiger_val.next_batch(),
validation_steps=1)
return model
print('------------model train!------------')
model = train(540, load=False)
# model.save('model.h5')
def decode_batch(out):
    # Greedy (best-path) CTC decoding: take the argmax at each timestep,
    # collapse consecutive repeats, then drop the CTC blank class
    # (index len(letters), i.e. the extra class added by get_output_size).
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]  # collapse repeats
        outstr = ''
        for c in out_best:
            if c < len(letters):  # skip the blank class
                outstr += letters[c]
        ret.append(outstr)
    return ret
print('------------model validate!------------')
validate_batch_size = 10
tiger_test = TextImageGenerator('test', 540, img_height, validate_batch_size, 4)
tiger_test.build_data()
net_inp = model.get_layer(name='the_input').input
net_out = model.get_layer(name='softmax').output
for inp_value, _ in tiger_test.next_batch():
bs = inp_value['the_input'].shape[0]
X_data = inp_value['the_input']
net_out_value = sess.run(net_out, feed_dict={net_inp:X_data})
pred_texts = decode_batch(net_out_value)
labels = inp_value['the_labels']
texts = []
    for label in labels:
        text = labels_to_text(label)  # includes trailing '!' padding from the -1 fill
        texts.append(text)
print(texts)
for i in range(bs):
fig = plt.figure(figsize=(10, 10))
outer = gridspec.GridSpec(2, 1, wspace=10, hspace=0.1)
ax1 = plt.Subplot(fig, outer[0])
fig.add_subplot(ax1)
ax2 = plt.Subplot(fig, outer[1])
fig.add_subplot(ax2)
print('Predicted: %s\nTrue: %s' % (pred_texts[i], texts[i]))
        img = X_data[i][:, :, 0].T  # assumes channels_last data format
ax1.set_title('Input img')
ax1.imshow(img, cmap='gray')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_title('Activations')
ax2.imshow(net_out_value[i].T, cmap='binary', interpolation='nearest')
        ax2.set_yticks(list(range(len(letters) + 1)))
        ax2.set_yticklabels(list(letters) + ['blank'])  # one extra tick for the CTC blank class
ax2.grid(False)
for h in np.arange(-0.5, len(letters) + 1 + 0.5, 1):
ax2.axhline(h, linestyle='-', color='k', alpha=0.5, linewidth=1)
#ax.axvline(x, linestyle='--', color='k')
plt.show()
break