Why does adding dropout layers work on the validation set but not on the test set?

Date: 2017-06-11 08:25:23

Tags: validation testing tensorflow deep-learning

I am working with a convolutional neural network in TensorFlow and am having trouble with the dropout layers. As recommended, I pass a keep_probability placeholder to the graph and set its value to 0.5 during training and to 1.0 during validation and testing. While observing the training process, the results look good on the validation set. However, when I test the network after training, the network fails.
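In other words, the dropout probability is fed at run time rather than baked into the graph. A minimal sketch of that pattern (TensorFlow 1.x as in the rest of my code; the tensors and sizes here are just illustrative, not from my actual network):

import tensorflow as tf  # TensorFlow 1.x

keep_probability = tf.placeholder(tf.float32, name="keep_probability")
x = tf.placeholder(tf.float32, shape=[None, 128])
# tf.nn.dropout keeps each unit with probability keep_probability and
# scales the survivors by 1/keep_probability (inverted dropout)
dropped = tf.nn.dropout(x, keep_prob=keep_probability)

with tf.Session() as sess:
    data = [[1.0] * 128]
    train_out = sess.run(dropped, {x: data, keep_probability: 0.5})  # training: drop about half
    eval_out = sess.run(dropped, {x: data, keep_probability: 1.0})   # validation/testing: keep everything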

UPDATE: When I say the network fails, I mean that it no longer segments the images correctly. During validation the network reaches an mIoU of about 80%, but on the test set it drops to around 40% and classifies all pixels into just one of the classes. Before I added the dropout layers, the mIoU was around 80% on both the validation and the test set.
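For reference, the mIoU numbers above are computed from a confusion matrix accumulated over the dataset (the hist array in the code below). A minimal sketch of that computation (my own illustration with made-up numbers, not the helper from my repository):

import numpy as np

def mean_iou(hist):
    # hist[i, j] counts pixels of ground-truth class i predicted as class j
    tp = np.diag(hist)                                # true positives per class
    denom = hist.sum(axis=1) + hist.sum(axis=0) - tp  # TP + FN + FP per class
    iou = tp / np.maximum(denom, 1)                   # avoid division by zero
    return iou, np.nanmean(iou)

# toy 2-class example: 80 and 15 correctly classified pixels, 5 confused
hist = np.array([[80., 5.],
                 [0., 15.]])
per_class_iou, miou = mean_iou(hist)
print(per_class_iou, miou)  # ~[0.94, 0.75], ~0.85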

I don't understand why the network fails on the test set when it works on the validation set.

I have added the code for training, testing, and the graph itself.

Code for training the network:

with tf.Graph().as_default():
  # Probability that the neuron's output will be kept during dropout
  keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")

  global_step = tf.Variable(0, trainable=False)

  images, labels = Inputs.datasetInputs(image_filenames, label_filenames, FLAGS.batch_size)
  val_images, val_labels = Inputs.datasetInputs(val_image_filenames, val_label_filenames, FLAGS.batch_size)

  train_data_node = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 3])
  train_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 1])
  phase_train = tf.placeholder(tf.bool, name='phase_train')

  logits = model.inference(train_data_node, phase_train, FLAGS.batch_size, keep_probability) # tensor, nothing calculated yet
  loss = model.cal_loss(logits, train_labels_node)
  # Build a Graph that trains the model with one batch of examples and updates the model parameters.
  train_op = model.train(loss, global_step)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    """ Starting iterations to train the network """
    for step in range(startstep, startstep + FLAGS.max_steps):
        image_batch, label_batch = sess.run(fetches=[images, labels])
        # since we still use mini-batches in eval, still set bn-layer phase_train = True
        feed_dict = {
          train_data_node: image_batch,
          train_labels_node: label_batch,
          phase_train: True,
          keep_probability: 0.5
        }

        _, loss_value = sess.run(fetches=[train_op, loss], feed_dict=feed_dict)

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          # eval current training batch per-class accuracy
          pred = sess.run(fetches=logits, feed_dict=feed_dict)

          Utils.per_class_acc(pred, label_batch)



        if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
          """ Validate training by running validation dataset """
          total_val_loss = 0.0
          hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
          for test_step in range(TEST_ITER):
            val_images_batch, val_labels_batch = sess.run(fetches=[val_images, val_labels])
            feed_dict = {
              train_data_node: val_images_batch,
              train_labels_node: val_labels_batch,
              phase_train: True,
              keep_probability: 1.0 # During validation/testing dropout should be turned off -> 100% chance of keeping the activation
            }
            _val_loss, _val_pred = sess.run(fetches=[loss, logits], feed_dict=feed_dict)

 (...)

Code for testing the network:

  keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")

  image_filenames, label_filenames = Inputs.get_filename_list(FLAGS.test_dir)
  test_data_node = tf.placeholder(tf.float32, shape=[testing_batch_size, FLAGS.image_h, FLAGS.image_w, FLAGS.image_c])  #360, 480, 3
  test_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.test_batch_size, FLAGS.image_h, FLAGS.image_w, 1])

  phase_train = tf.placeholder(tf.bool, name='phase_train')

  logits = model.inference(test_data_node, phase_train, testing_batch_size, keep_probability)
  loss = model.cal_loss(logits, test_labels_node)
  pred = tf.argmax(logits, dimension=3)

  with tf.Session() as sess:
    # Load checkpoint
    saver.restore(sess, FLAGS.model_ckpt_dir)

    images, labels = Inputs.get_all_test_data(image_filenames, label_filenames)
    threads = tf.train.start_queue_runners(sess=sess)
    hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
    step=0
    for image_batch, label_batch  in zip(images, labels):
      feed_dict = { #maps graph elements to values
        test_data_node: image_batch,
        test_labels_node: label_batch,
        phase_train: False,
        keep_probability: 1.0 # During testing dropout should be turned off -> 100% chance of keeping the activation
      }

      dense_prediction, im = sess.run(fetches=[logits, pred], feed_dict=feed_dict)
(...)

The graph:

def inference(images, phase_train, batch_size, keep_prob):
  conv1_1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1_1")
  conv1_2 = conv_layer_with_bn(conv1_1, [7, 7, 64, 64], phase_train, name="conv1_2")
  dropout1 = tf.layers.dropout(conv1_2, rate=(1-keep_prob), training=phase_train, name="dropout1")
  pool1, pool1_indices = tf.nn.max_pool_with_argmax(dropout1, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool1')
  conv2_1 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2_1")
  conv2_2 = conv_layer_with_bn(conv2_1, [7, 7, 64, 64], phase_train, name="conv2_2")
  dropout2 = tf.layers.dropout(conv2_2, rate=(1-keep_prob), training=phase_train, name="dropout2")
  pool2, pool2_indices = tf.nn.max_pool_with_argmax(dropout2, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool2')
  conv3_1 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3_1")
  conv3_2 = conv_layer_with_bn(conv3_1, [7, 7, 64, 64], phase_train, name="conv3_2")
  conv3_3 = conv_layer_with_bn(conv3_2, [7, 7, 64, 64], phase_train, name="conv3_3")
  dropout3 = tf.layers.dropout(conv3_3, rate=(1-keep_prob), training=phase_train, name="dropout3")
  pool3, pool3_indices = tf.nn.max_pool_with_argmax(dropout3, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool3')
  conv4_1 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4_1")
  conv4_2 = conv_layer_with_bn(conv4_1, [7, 7, 64, 64], phase_train, name="conv4_2")
  conv4_3 = conv_layer_with_bn(conv4_2, [7, 7, 64, 64], phase_train, name="conv4_3")
  dropout4 = tf.layers.dropout(conv4_3, rate=(1-keep_prob), training=phase_train, name="dropout4")
  pool4, pool4_indices = tf.nn.max_pool_with_argmax(dropout4, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool4')
  conv5_1 = conv_layer_with_bn(pool4, [7, 7, 64, 64], phase_train, name="conv5_1")
  conv5_2 = conv_layer_with_bn(conv5_1, [7, 7, 64, 64], phase_train, name="conv5_2")
  conv5_3 = conv_layer_with_bn(conv5_2, [7, 7, 64, 64], phase_train, name="conv5_3")
  dropout5 = tf.layers.dropout(conv5_3, rate=(1-keep_prob), training=phase_train, name="dropout5")
  pool5, pool5_indices = tf.nn.max_pool_with_argmax(dropout5, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool5')
  """ End of encoder """

  """ Start decoder """
  dropout5_decode = tf.layers.dropout(pool5, rate=(1-keep_prob), training=phase_train, name="dropout5_decode")
  upsample5 = deconv_layer(dropout5_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//16, FLAGS.image_w//16, 64], 2, "up5")
  conv_decode5_1 = conv_layer_with_bn(upsample5, [7, 7, 64, 64], phase_train, True, name="conv_decode5_1")
  conv_decode5_2 = conv_layer_with_bn(conv_decode5_1, [7, 7, 64, 64], phase_train, True, name="conv_decode5_2")
  conv_decode5_3 = conv_layer_with_bn(conv_decode5_2, [7, 7, 64, 64], phase_train, True, name="conv_decode5_3")

  dropout4_decode = tf.layers.dropout(conv_decode5_3, rate=(1-keep_prob), training=phase_train, name="dropout4_decode")
  upsample4 = deconv_layer(dropout4_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4")
  conv_decode4_1 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, True, name="conv_decode4_1")
  conv_decode4_2 = conv_layer_with_bn(conv_decode4_1, [7, 7, 64, 64], phase_train, True, name="conv_decode4_2")
  conv_decode4_3 = conv_layer_with_bn(conv_decode4_2, [7, 7, 64, 64], phase_train, True, name="conv_decode4_3")

  dropout3_decode = tf.layers.dropout(conv_decode4_3, rate=(1-keep_prob), training=phase_train, name="dropout3_decode")
  upsample3 = deconv_layer(dropout3_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3")
  conv_decode3_1 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, True, name="conv_decode3_1")
  conv_decode3_2 = conv_layer_with_bn(conv_decode3_1, [7, 7, 64, 64], phase_train, True, name="conv_decode3_2")
  conv_decode3_3 = conv_layer_with_bn(conv_decode3_2, [7, 7, 64, 64], phase_train, True, name="conv_decode3_3")

  dropout2_decode = tf.layers.dropout(conv_decode3_3, rate=(1-keep_prob), training=phase_train, name="dropout2_decode")
  upsample2 = deconv_layer(dropout2_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2")
  conv_decode2_1 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, True, name="conv_decode2_1")
  conv_decode2_2 = conv_layer_with_bn(conv_decode2_1, [7, 7, 64, 64], phase_train, True, name="conv_decode2_2")

  dropout1_decode = tf.layers.dropout(conv_decode2_2, rate=(1-keep_prob), training=phase_train, name="dropout1_deconv")
  upsample1 = deconv_layer(dropout1_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1")
  conv_decode1_1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_1")
  conv_decode1_2 = conv_layer_with_bn(conv_decode1_1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_2")
  """ End of decoder """

  """ Start Classify """
  # output predicted class number (2)
  with tf.variable_scope('conv_classifier') as scope:
    shape=[1, 1, 64, FLAGS.num_class]
    kernel = _variable_with_weight_decay('weights', shape=shape, initializer=tf.contrib.layers.variance_scaling_initializer(), #orthogonal_initializer()
                                           wd=None)

    conv = tf.nn.conv2d(conv_decode1_2, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
    conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name) # add the 1-D bias tensor to the conv output (the logits; no activation applied here)
    #logit = conv_classifier = prediction
  return conv_classifier
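Note on the dropout arguments above: tf.layers.dropout takes a drop rate, so the graph passes rate=(1 - keep_prob), and with training=False (or a rate of 0) the layer is the identity. A small NumPy sketch of these inverted-dropout semantics, just to make the scaling explicit (my own illustration, not code from the model):

import numpy as np

def dropout(x, rate, training, seed=0):
    # rate is the probability of dropping a unit (rate = 1 - keep_prob)
    if not training or rate == 0.0:
        return x                      # inference (or rate 0): identity
    keep_prob = 1.0 - rate
    mask = np.random.default_rng(seed).random(x.shape) < keep_prob
    # survivors are scaled by 1/keep_prob so the expected activation is unchanged
    return np.where(mask, x / keep_prob, 0.0)

x = np.ones((2, 4))
print(dropout(x, rate=0.5, training=True))   # about half the units zeroed, the rest scaled to 2.0
print(dropout(x, rate=0.5, training=False))  # unchanged at inference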

1 Answer:

Answer 0 (score: 0)

I eventually found out that my problem stemmed from batch normalization not being implemented correctly.

The problem was that I did not update moving_mean and moving_variance correctly. This is pointed out in the TensorFlow docs for batch_norm:

Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they need to be added as a dependency to the train_op.

So my code now looks like this:

def training(loss):

    global_step = tf.Variable(0, name='global_step', trainable=False)

    #This motif is needed to hook up the batch_norm updates to the training
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if(FLAGS.optimizer == "SGD"):
            print("Running with SGD optimizer")
            optimizer = tf.train.GradientDescentOptimizer(0.1)
        elif(FLAGS.optimizer == "adam"):
            print("Running with adam optimizer")
            optimizer = tf.train.AdamOptimizer(0.001)
        elif(FLAGS.optimizer == "adagrad"):
            print("Running with adagrad optimizer")
            optimizer = tf.train.AdagradOptimizer(0.01)
        else:
            raise ValueError("optimizer was not recognized.")

        train_op = optimizer.minimize(loss=loss, global_step=global_step)
    return train_op, global_step
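This also explains the symptom in the question: during validation I fed phase_train: True, so batch norm normalized with the statistics of the current batch and everything looked fine, while during testing phase_train: False makes it fall back to moving_mean and moving_variance, which stay at their initial values (0 and 1) if the update ops never run. A minimal NumPy sketch of that behaviour (my own illustration of standard batch norm without scale/offset, not the project's code):

import numpy as np

def batch_norm(x, moving_mean, moving_var, training, momentum=0.99, eps=1e-3):
    if training:
        # normalize with the current batch statistics and update the moving
        # averages -- this update is what the UPDATE_OPS dependency triggers
        mean, var = x.mean(axis=0), x.var(axis=0)
        moving_mean = momentum * moving_mean + (1 - momentum) * mean
        moving_var = momentum * moving_var + (1 - momentum) * var
    else:
        # inference: normalize with the stored moving statistics
        mean, var = moving_mean, moving_var
    return (x - mean) / np.sqrt(var + eps), moving_mean, moving_var

x = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=(8, 3))
mm, mv = np.zeros(3), np.ones(3)  # initial values; never change if the update ops are not run
y_val, _, _ = batch_norm(x, mm, mv, training=True)    # uses batch statistics -> roughly zero mean
y_test, _, _ = batch_norm(x, mm, mv, training=False)  # uses the stale mean 0 / var 1 -> badly scaled
print(y_val.mean(), y_test.mean())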

The full implementation of my network can be seen here: https://github.com/mathildor/TF-SegNet