首先说明:我是深度学习和 TensorFlow 的新手,如果问题比较初级,还请见谅。希望有人能帮我把思路理清楚。 我正在做一个 OCR 项目,手头只有 4000 张带标注的图像,图像内容是白色背景上的点状(虚线)字体。 我决定使用以下代码来完成这项任务:https://github.com/Pay20Y/FOTS_TF
我使用了在 SynthText 数据集上预训练的模型,并在自己的数据集上继续训练,但效果不太理想。 我认为原因之一可能是我的自定义数据集的字符类别数少于预训练模型的类别数。 我了解到迁移学习可以只训练最后一层,从而使用不同的类别数,但我不知道具体该怎么做。更确切地说,我不知道如何确定计算图中哪一层是最后一层。
以下是构建计算图的代码:
def cnn(self, rois):
    """Run the recognition CNN over the RoI batch.

    Three stages are applied in sequence with 64, 128 and 256 output
    channels.  Each stage is two 3x3 ReLU convolutions (stride 1, SAME
    padding, no normalizer) followed by a max-pool with kernel [2, 1]
    and stride [2, 1], so only the first spatial dimension is pooled.

    Args:
        rois: input tensor passed to the first convolution.
            Assumed to be NHWC image crops -- TODO confirm against caller.

    Returns:
        The output tensor of the third pooling layer.
    """
    with tf.variable_scope("recog/cnn"):
        features = rois
        # Layers are created in the same order as a hand-unrolled version
        # (conv, conv, pool per stage) so auto-generated scope names match.
        for channels in (64, 128, 256):
            for _unused in range(2):
                features = slim.conv2d(
                    features,
                    channels,
                    3,
                    stride=1,
                    padding='SAME',
                    activation_fn=tf.nn.relu,
                    normalizer_fn=None,
                )
            features = slim.max_pool2d(features, [2, 1], stride=[2, 1])
        return features
def bilstm(self, input_feature, seq_len):
    """Run a single-layer bidirectional LSTM over the feature sequence.

    Both directions use an ``LSTMCell`` with ``self.rnn_hidden_num`` units
    wrapped in dropout, where ``self.keepProb`` is used as the keep
    probability for both cell inputs and cell outputs.

    Args:
        input_feature: sequence tensor fed to the RNN; consumed batch-major
            because ``time_major`` is left at its default.
        seq_len: per-example sequence lengths passed as ``sequence_length``.

    Returns:
        The forward and backward outputs concatenated along the last axis.
    """
    with tf.variable_scope("recog/rnn"):
        wrapped_cells = []
        # Build forward first, then backward, each immediately wrapped in dropout.
        for _direction in ("fw", "bw"):
            base_cell = rnn.LSTMCell(self.rnn_hidden_num)
            wrapped_cells.append(
                tf.nn.rnn_cell.DropoutWrapper(
                    base_cell,
                    input_keep_prob=self.keepProb,
                    output_keep_prob=self.keepProb,
                )
            )
        forward_cell, backward_cell = wrapped_cells

        # Final states are not needed; only the per-step outputs are used.
        direction_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            input_feature,
            sequence_length=seq_len,
            dtype=tf.float32,
        )
        return tf.concat(direction_outputs, axis=-1)
def build_graph(self, rois, seq_len):
    """Build the recognition head: CNN -> BiLSTM -> per-step class logits.

    Args:
        rois: batch of region crops fed to ``self.cnn``; its first dimension
            is taken as the number of RoIs.
        seq_len: per-RoI sequence lengths, forwarded to ``self.bilstm``.

    Returns:
        Logits reshaped to ``[num_rois, -1, self.num_classes]`` and then
        transposed with ``(1, 0, 2)``, i.e. with the step axis first.
    """
    num_rois = tf.shape(rois)[0]

    # Expected N x 1 x W x C: assumes the RoI height collapses to 1 after the
    # three height-halving pools in self.cnn -- TODO confirm input height.
    cnn_feature = self.cnn(rois)
    # Drop the singleton height axis directly.  No intermediate reshape: the
    # tensor must still be rank 4 here for squeeze(axis=1) to be valid.
    cnn_feature = tf.squeeze(cnn_feature, axis=1)  # N x W x C

    # The BiLSTM is fed batch-major, so no transpose is applied beforehand.
    lstm_output = self.bilstm(cnn_feature, seq_len)  # N x T x 2H

    # Flatten batch and time so one dense projection handles every step.
    flat_output = tf.reshape(lstm_output, [-1, self.rnn_hidden_num * 2])  # (N * T) x 2H

    # Output projection.  Variable names "W" and "b" are kept as-is so that
    # existing checkpoints continue to match.
    W = tf.Variable(tf.truncated_normal([self.rnn_hidden_num * 2, self.num_classes], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0., shape=[self.num_classes]), name="b")

    logits = tf.matmul(flat_output, W) + b  # (N * T) x num_classes
    logits = tf.reshape(logits, [num_rois, -1, self.num_classes])  # N x T x num_classes

    # Step axis first: T x N x num_classes.
    return tf.transpose(logits, (1, 0, 2))
我不确定在这种情况下使用迁移学习是否合适。 是否还有其他方法,既能利用预训练模型在我自己的数据集上进行微调,又能使用不同数量的字符类别? 我对如何解决这个任务以及如何改进结果感到很困惑。我也尝试过不使用预训练模型、从头开始训练,但那样模型更容易过拟合。