Question

我正在尝试让一个采用如下架构的手写识别模型在训练集上过拟合：

# Three stages with the same layout: 3x3 convolution, max pooling with
# kernel size 2, then a multidimensional RNN of matching depth.
for depth in (16, 32, 64):
    features = slim.conv2d(features, depth, [3, 3])
    features = slim.max_pool2d(features, 2)
    features = mdrnn(features, depth)

# In this stage the multidimensional RNN runs before the pooling.
features = slim.conv2d(features, 128, [3, 3])
features = mdrnn(features, 128)
features = slim.max_pool2d(features, 2)

features = slim.conv2d(features, 256, [3, 3])
features = slim.max_pool2d(features, 2)
features = mdrnn(features, 256)

# Reshape for the recurrent layers, then stack five bidirectional RNNs.
features = _reshape_to_rnn_dims(features)
for _ in range(5):
    features = bidirectional_rnn(features, 128)

使用tensorflow的mdrnn代码（稍作修改）：

def mdrnn(inputs, num_hidden, scope=None):
    """Apply a separable two-pass bidirectional RNN to a 4-D tensor.

    The input is scanned once, axes 1 and 2 are swapped, the result is
    scanned a second time under the "vertical" variable scope, and the axes
    are swapped back.

    Args:
        inputs: 4-D tensor; presumably (batch, height, width, channels) --
            TODO confirm against the caller.
        num_hidden: Requested output depth. Each scan builds its forward and
            backward cells with ``num_hidden // 2`` units.
        scope: Optional variable scope. When ``None`` the default name
            "multidimensional_rnn" is used. The original body referenced
            ``scope`` without declaring it, so it is now an explicit,
            optional parameter.

    Returns:
        The output of the second scan with axes 1 and 2 swapped back.
    """
    with tf.variable_scope(scope, "multidimensional_rnn", [inputs]):
        hidden_sequence_horizontal = _bidirectional_rnn_scan(inputs,
                                                             num_hidden // 2)
        with tf.variable_scope("vertical"):
            # Swap the two middle axes so the second scan runs along the
            # other spatial direction.
            transposed = tf.transpose(hidden_sequence_horizontal, [0, 2, 1, 3])
            output_transposed = _bidirectional_rnn_scan(transposed, num_hidden // 2)
        output = tf.transpose(output_transposed, [0, 2, 1, 3])
        return output

def _bidirectional_rnn_scan(inputs, num_hidden):
    """Run a bidirectional RNN over every row of a 4-D tensor.

    The tensor is flattened into a batch of sequences, passed through
    ``bidirectional_rnn``, and folded back into image form.

    Args:
        inputs: 4-D tensor whose axis 1 must have a statically known size,
            because that size is read from the static shape below.
        num_hidden: Number of units for each of the two RNN directions.

    Returns:
        The RNN output converted back to image form by ``sequence_to_images``.
    """
    # ``values`` has to be given by keyword: the second positional parameter
    # of tf.variable_scope is ``default_name``, not ``values``.
    with tf.variable_scope("BidirectionalRNN", values=[inputs]):
        # Read the static height before flattening; it is needed to undo
        # the flattening afterwards.
        height = inputs.get_shape().as_list()[1]
        inputs = images_to_sequence(inputs)
        output_sequence = bidirectional_rnn(inputs, num_hidden)
        output = sequence_to_images(output_sequence, height)
        return output

def images_to_sequence(inputs):
    """Flatten a 4-D tensor into a batch of row sequences.

    Axes 0 and 1 are merged using the dynamic shape, while the sizes of
    axes 2 and 3 are taken from the static shape.

    Args:
        inputs: 4-D tensor; presumably (batch, height, width, channels) --
            TODO confirm against the caller.

    Returns:
        A 3-D tensor of shape (axis0 * axis1, width, num_channels).
    """
    _static_batch, _static_height, width, num_channels = _get_shape_as_list(inputs)
    dynamic_shape = tf.shape(inputs)
    num_rows = dynamic_shape[0] * dynamic_shape[1]
    flattened_shape = [num_rows, width, num_channels]
    return tf.reshape(inputs, flattened_shape)

def sequence_to_images(tensor, height):
    """Fold a batch of row sequences back into image form.

    This is the inverse of ``images_to_sequence``, which merges the first two
    axes of an image tensor into one leading axis of size batch * height.

    Args:
        tensor: 3-D tensor of shape (batch * height, width, depth).
        height: Number of rows per image, used to split the leading axis.

    Returns:
        A 4-D tensor of shape (batch, height, width, depth).
    """
    num_batches, width, depth = tensor.get_shape().as_list()
    if num_batches is None:
        # Leading size unknown at graph-construction time; let reshape
        # infer the batch dimension.
        num_batches = -1
    else:
        num_batches = num_batches // height
    # The rows were flattened batch-major, so splitting the leading axis into
    # (batch, height) restores every element to its original position. The
    # previous reshape to (batch, width, height, depth) followed by a
    # transpose produced the same output shape but scrambled the spatial
    # layout whenever height != width.
    return tf.reshape(tensor, [num_batches, height, width, depth])

def bidirectional_rnn(inputs, num_hidden, concat_output=True,
                      scope=None):
    """Run a bidirectional LSTM layer over ``inputs``.

    Args:
        inputs: Input tensor passed to ``tf.nn.bidirectional_dynamic_rnn``.
        num_hidden: Number of units in each of the two LSTM cells.
        concat_output: When true, join the forward and backward outputs
            along axis 2; otherwise return them as produced by
            ``tf.nn.bidirectional_dynamic_rnn``.
        scope: Optional variable scope. When ``None`` the default name
            "bidirectional_rnn" is used.

    Returns:
        The concatenated output tensor, or the unconcatenated outputs when
        ``concat_output`` is false.
    """
    with tf.variable_scope(scope, "bidirectional_rnn", [inputs]):
        forward_cell = rnn.LSTMCell(num_hidden)
        backward_cell = rnn.LSTMCell(num_hidden)
        direction_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            forward_cell, backward_cell, inputs, dtype=tf.float32)
        if not concat_output:
            return direction_outputs
        return tf.concat(direction_outputs, 2)

训练时 ctc_loss 在下降，但即使训练了一千个 epoch 也没有收敛，标签错误率只是在来回波动。

我预处理图像，使其看起来像这样：

我还注意到，网络在多个训练步骤中输出了几乎相同的预测：

INFO:tensorflow:outputs = [[51 42 70 42 34 42 34 42 34 29 42 29 42 29 42 29 42 29 42 29 42 29 42 29
  42 29  4 72 42 58 20]] (1.156 sec)
INFO:tensorflow:labels = [[38 78 52 29 70 51 78  8  1 78 15  8  1 22 78 52  4 24 78 28  3  9  8 15
  11 14 13 13 78  2  4  1 16]] (1.156 sec)
INFO:tensorflow:label_error_rate = 0.93939394 (1.156 sec)
INFO:tensorflow:global_step/sec: 0.888003
INFO:tensorflow:outputs = [[51 42 70 42 34 42 34 42 34 29 42 29 42 29 42 29 42 29 42 29 42 29 42 29
  42 29  4 65 42 58 20]] (1.126 sec)
INFO:tensorflow:labels = [[38 78 52 29 70 51 78  8  1 78 15  8  1 22 78 52  4 24 78 28  3  9  8 15
  11 14 13 13 78  2  4  1 16]] (1.126 sec)
INFO:tensorflow:label_error_rate = 0.969697 (1.126 sec)
INFO:tensorflow:global_step/sec: 0.866796
INFO:tensorflow:outputs = [[51 42 70 42 34 42 34 42 34 29 42 29 42 29 42 29 42 29 42 29 42 29 42 29
  42 29  4 65 42 58 20]] (1.154 sec)
INFO:tensorflow:labels = [[38 78 52 29 70 51 78  8  1 78 15  8  1 22 78 52  4 24 78 28  3  9  8 15
  11 14 13 13 78  2  4  1 16]] (1.154 sec)
INFO:tensorflow:label_error_rate = 0.969697 (1.154 sec)
INFO:tensorflow:global_step/sec: 0.88832
INFO:tensorflow:outputs = [[51 42 70 42 34 42 34 42 34 29 42 29 42 29 42 29 42 29 42 29 42 29 42 29
  42 29  4 65 42 58 20]] (1.126 sec)
INFO:tensorflow:labels = [[38 78 52 29 70 51 78  8  1 78 15  8  1 22 78 52  4 24 78 28  3  9  8 15
  11 14 13 13 78  2  4  1 16]] (1.126 sec)
INFO:tensorflow:label_error_rate = 0.969697 (1.126 sec)

出现这种情况的原因是什么？我已经把一个可复现的最小示例放在了这里：https://github.com/selcouthlyBlue/CNN-LSTM-CTC-HIGH-LOSS

更新

当我把下面这段 reshape 转换代码：

# Flatten to 2-D so the fully connected layer scores each row on its own.
outputs = tf.reshape(inputs, [-1, num_outputs])
logits = slim.fully_connected(outputs, num_classes)
# NOTE(review): reshape only regroups elements in their existing order. If
# the rows above are ordered batch-major (presumably -- confirm the layout of
# `inputs`), this does not produce a true time-major tensor.
logits = tf.reshape(logits, [num_steps, -1, num_classes])

改为如下写法：

# Flatten to 2-D so the fully connected layer scores each row on its own.
outputs = tf.reshape(inputs, shape=[-1, num_outputs])
step_scores = slim.fully_connected(outputs, num_classes)
# Restore the 3-D layout first, then swap the first two axes explicitly.
# Presumably this turns batch-major scores into time-major logits --
# confirm against the layout of `inputs`.
batch_major_scores = tf.reshape(step_scores, shape=[-1, num_steps, num_classes])
logits = tf.transpose(batch_major_scores, perm=(1, 0, 2))

表现有所改善：

（删除了mdrnn图层）