I am working with a convolutional neural network in TensorFlow and I am having trouble with the dropout layers. As recommended, I pass a keep_probability placeholder to the graph and set it to 0.5 during training and to 1.0 during validation and testing. While monitoring training, the results on the validation set look good. However, when I test the network after training, the network fails.
Update: When I say the network fails, I mean that it no longer segments the images correctly. During validation the network reaches an mIoU of about 80%, but at test time it drops to around 40% and the network classifies every pixel as a single class. Before I added the dropout layers, both the validation and the test set were at around 80% mIoU.
I do not understand why the network fails on the test set when it works on the validation set.
I have added the code for training, for testing, and for the graph itself.
Code for training the network:
with tf.Graph().as_default():
    # Probability that a neuron's output is kept during dropout
    keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")
    global_step = tf.Variable(0, trainable=False)

    images, labels = Inputs.datasetInputs(image_filenames, label_filenames, FLAGS.batch_size)
    val_images, val_labels = Inputs.datasetInputs(val_image_filenames, val_label_filenames, FLAGS.batch_size)

    train_data_node = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 3])
    train_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 1])
    phase_train = tf.placeholder(tf.bool, name='phase_train')

    logits = model.inference(train_data_node, phase_train, FLAGS.batch_size, keep_probability)  # tensor, nothing calculated yet
    loss = model.cal_loss(logits, train_labels_node)
    # Build a Graph that trains the model with one batch of examples and updates the model parameters.
    train_op = model.train(loss, global_step)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        """ Starting iterations to train the network """
        for step in range(startstep, startstep + FLAGS.max_steps):
            image_batch, label_batch = sess.run(fetches=[images, labels])
            # since we still use mini-batches in eval, still set bn-layer phase_train = True
            feed_dict = {
                train_data_node: image_batch,
                train_labels_node: label_batch,
                phase_train: True,
                keep_probability: 0.5
            }
            _, loss_value = sess.run(fetches=[train_op, loss], feed_dict=feed_dict)

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                # eval per-class accuracy on the current training batch
                pred = sess.run(fetches=logits, feed_dict=feed_dict)
                Utils.per_class_acc(pred, label_batch)

            if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                """ Validate training by running the validation dataset """
                total_val_loss = 0.0
                hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
                for test_step in range(TEST_ITER):
                    val_images_batch, val_labels_batch = sess.run(fetches=[val_images, val_labels])
                    feed_dict = {
                        train_data_node: val_images_batch,
                        train_labels_node: val_labels_batch,
                        phase_train: True,
                        keep_probability: 1.0  # dropout should be turned off during validation -> 100% chance of keeping each activation
                    }
                    _val_loss, _val_pred = sess.run(fetches=[loss, logits], feed_dict=feed_dict)
                    (...)
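A side note on how these feeds interact with the dropout layers defined in the graph further down: tf.layers.dropout only drops activations when its training argument evaluates to true, and it rescales the surviving activations by 1/(1 - rate), i.e. inverted dropout. A minimal, self-contained sketch (the names here are illustrative, not part of the project) of the three feed combinations used above:

import tensorflow as tf

# Illustrative stand-ins for the placeholders used above (not the project's actual code).
keep_probability = tf.placeholder(tf.float32, name="keep_probability")
phase_train = tf.placeholder(tf.bool, name="phase_train")

x = tf.ones([1, 4])  # dummy activations
y = tf.layers.dropout(x, rate=(1 - keep_probability), training=phase_train)

with tf.Session() as sess:
    # Training feed: roughly half the units are zeroed, survivors are scaled by 2.
    print(sess.run(y, {keep_probability: 0.5, phase_train: True}))
    # Validation feed as used above: rate becomes 0, so nothing is dropped.
    print(sess.run(y, {keep_probability: 1.0, phase_train: True}))
    # Test feed: training=False turns dropout into an identity op regardless of rate.
    print(sess.run(y, {keep_probability: 1.0, phase_train: False}))

So feeding keep_probability: 1.0 during validation and phase_train: False during testing should both make dropout a no-op.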
Code for testing the network:
keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")
image_filenames, label_filenames = Inputs.get_filename_list(FLAGS.test_dir)

test_data_node = tf.placeholder(tf.float32, shape=[testing_batch_size, FLAGS.image_h, FLAGS.image_w, FLAGS.image_c])  # 360, 480, 3
test_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.test_batch_size, FLAGS.image_h, FLAGS.image_w, 1])
phase_train = tf.placeholder(tf.bool, name='phase_train')

logits = model.inference(test_data_node, phase_train, testing_batch_size, keep_probability)
loss = model.cal_loss(logits, test_labels_node)
pred = tf.argmax(logits, dimension=3)

with tf.Session() as sess:
    # Load checkpoint
    saver.restore(sess, FLAGS.model_ckpt_dir)
    images, labels = Inputs.get_all_test_data(image_filenames, label_filenames)
    threads = tf.train.start_queue_runners(sess=sess)
    hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
    step = 0
    for image_batch, label_batch in zip(images, labels):
        feed_dict = {  # maps graph elements to values
            test_data_node: image_batch,
            test_labels_node: label_batch,
            phase_train: False,
            keep_probability: 1.0  # dropout should be turned off during testing -> 100% chance of keeping each activation
        }
        dense_prediction, im = sess.run(fetches=[logits, pred], feed_dict=feed_dict)
        (...)
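The elided part of the test loop presumably accumulates the per-class confusion matrix hist and derives the mIoU figures quoted in the update from it. For reference, a minimal sketch of that calculation (update_hist and mean_iou are illustrative helper names, not from the project), assuming hist[i, j] counts pixels of true class i predicted as class j:

import numpy as np

def update_hist(hist, labels, preds, num_class):
    # Flattened integer class ids for ground truth and prediction of one batch.
    labels, preds = labels.flatten(), preds.flatten()
    mask = (labels >= 0) & (labels < num_class)
    hist += np.bincount(num_class * labels[mask].astype(int) + preds[mask],
                        minlength=num_class ** 2).reshape(num_class, num_class)
    return hist

def mean_iou(hist):
    # Per-class IoU = true positives / (ground-truth pixels + predicted pixels - true positives)
    tp = np.diag(hist)
    denom = hist.sum(axis=1) + hist.sum(axis=0) - tp
    return np.mean(tp / np.maximum(denom, 1))  # guard against classes that never occur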
The graph:
def inference(images, phase_train, batch_size, keep_prob):
    conv1_1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1_1")
    conv1_2 = conv_layer_with_bn(conv1_1, [7, 7, 64, 64], phase_train, name="conv1_2")
    dropout1 = tf.layers.dropout(conv1_2, rate=(1 - keep_prob), training=phase_train, name="dropout1")
    pool1, pool1_indices = tf.nn.max_pool_with_argmax(dropout1, ksize=[1, 2, 2, 1],
                                                      strides=[1, 2, 2, 1], padding='SAME', name='pool1')

    conv2_1 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2_1")
    conv2_2 = conv_layer_with_bn(conv2_1, [7, 7, 64, 64], phase_train, name="conv2_2")
    dropout2 = tf.layers.dropout(conv2_2, rate=(1 - keep_prob), training=phase_train, name="dropout2")
    pool2, pool2_indices = tf.nn.max_pool_with_argmax(dropout2, ksize=[1, 2, 2, 1],
                                                      strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    conv3_1 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3_1")
    conv3_2 = conv_layer_with_bn(conv3_1, [7, 7, 64, 64], phase_train, name="conv3_2")
    conv3_3 = conv_layer_with_bn(conv3_2, [7, 7, 64, 64], phase_train, name="conv3_3")
    dropout3 = tf.layers.dropout(conv3_3, rate=(1 - keep_prob), training=phase_train, name="dropout3")
    pool3, pool3_indices = tf.nn.max_pool_with_argmax(dropout3, ksize=[1, 2, 2, 1],
                                                      strides=[1, 2, 2, 1], padding='SAME', name='pool3')

    conv4_1 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4_1")
    conv4_2 = conv_layer_with_bn(conv4_1, [7, 7, 64, 64], phase_train, name="conv4_2")
    conv4_3 = conv_layer_with_bn(conv4_2, [7, 7, 64, 64], phase_train, name="conv4_3")
    dropout4 = tf.layers.dropout(conv4_3, rate=(1 - keep_prob), training=phase_train, name="dropout4")
    pool4, pool4_indices = tf.nn.max_pool_with_argmax(dropout4, ksize=[1, 2, 2, 1],
                                                      strides=[1, 2, 2, 1], padding='SAME', name='pool4')

    conv5_1 = conv_layer_with_bn(pool4, [7, 7, 64, 64], phase_train, name="conv5_1")
    conv5_2 = conv_layer_with_bn(conv5_1, [7, 7, 64, 64], phase_train, name="conv5_2")
    conv5_3 = conv_layer_with_bn(conv5_2, [7, 7, 64, 64], phase_train, name="conv5_3")
    dropout5 = tf.layers.dropout(conv5_3, rate=(1 - keep_prob), training=phase_train, name="dropout5")
    pool5, pool5_indices = tf.nn.max_pool_with_argmax(dropout5, ksize=[1, 2, 2, 1],
                                                      strides=[1, 2, 2, 1], padding='SAME', name='pool5')
    """ End of encoder """

    """ Start decoder """
    dropout5_decode = tf.layers.dropout(pool5, rate=(1 - keep_prob), training=phase_train, name="dropout5_decode")
    upsample5 = deconv_layer(dropout5_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h // 16, FLAGS.image_w // 16, 64], 2, "up5")
    conv_decode5_1 = conv_layer_with_bn(upsample5, [7, 7, 64, 64], phase_train, True, name="conv_decode5_1")
    conv_decode5_2 = conv_layer_with_bn(conv_decode5_1, [7, 7, 64, 64], phase_train, True, name="conv_decode5_2")
    conv_decode5_3 = conv_layer_with_bn(conv_decode5_2, [7, 7, 64, 64], phase_train, True, name="conv_decode5_3")

    dropout4_decode = tf.layers.dropout(conv_decode5_3, rate=(1 - keep_prob), training=phase_train, name="dropout4_decode")
    upsample4 = deconv_layer(dropout4_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h // 8, FLAGS.image_w // 8, 64], 2, "up4")
    conv_decode4_1 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, True, name="conv_decode4_1")
    conv_decode4_2 = conv_layer_with_bn(conv_decode4_1, [7, 7, 64, 64], phase_train, True, name="conv_decode4_2")
    conv_decode4_3 = conv_layer_with_bn(conv_decode4_2, [7, 7, 64, 64], phase_train, True, name="conv_decode4_3")

    dropout3_decode = tf.layers.dropout(conv_decode4_3, rate=(1 - keep_prob), training=phase_train, name="dropout3_decode")
    upsample3 = deconv_layer(dropout3_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h // 4, FLAGS.image_w // 4, 64], 2, "up3")
    conv_decode3_1 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, True, name="conv_decode3_1")
    conv_decode3_2 = conv_layer_with_bn(conv_decode3_1, [7, 7, 64, 64], phase_train, True, name="conv_decode3_2")
    conv_decode3_3 = conv_layer_with_bn(conv_decode3_2, [7, 7, 64, 64], phase_train, True, name="conv_decode3_3")

    dropout2_decode = tf.layers.dropout(conv_decode3_3, rate=(1 - keep_prob), training=phase_train, name="dropout2_decode")
    upsample2 = deconv_layer(dropout2_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h // 2, FLAGS.image_w // 2, 64], 2, "up2")
    conv_decode2_1 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, True, name="conv_decode2_1")
    conv_decode2_2 = conv_layer_with_bn(conv_decode2_1, [7, 7, 64, 64], phase_train, True, name="conv_decode2_2")

    dropout1_decode = tf.layers.dropout(conv_decode2_2, rate=(1 - keep_prob), training=phase_train, name="dropout1_deconv")
    upsample1 = deconv_layer(dropout1_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1")
    conv_decode1_1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_1")
    conv_decode1_2 = conv_layer_with_bn(conv_decode1_1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_2")
    """ End of decoder """

    """ Start classifier """
    # output predicted class number (2)
    with tf.variable_scope('conv_classifier') as scope:
        shape = [1, 1, 64, FLAGS.num_class]
        kernel = _variable_with_weight_decay('weights', shape=shape,
                                             initializer=tf.contrib.layers.variance_scaling_initializer(),  # orthogonal_initializer()
                                             wd=None)
        conv = tf.nn.conv2d(conv_decode1_2, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
        conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name)  # simple add of a 1-D bias tensor to every spatial position

    # logit = conv_classifier = prediction
    return conv_classifier
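conv_layer_with_bn and deconv_layer are not shown in the question. Since the fix in the answer below concerns batch normalization, here is a minimal sketch of what a convolution + batch-norm (+ ReLU) helper along these lines might look like; this is an assumption based on tf.contrib.layers.batch_norm, and the project's real helper may differ:

def conv_layer_with_bn(inputs, shape, phase_train, activation=True, name=None):
    # shape = [filter_height, filter_width, in_channels, out_channels]
    with tf.variable_scope(name):
        kernel = tf.get_variable("weights", shape,
                                 initializer=tf.contrib.layers.variance_scaling_initializer())
        conv = tf.nn.conv2d(inputs, kernel, strides=[1, 1, 1, 1], padding="SAME")
        # Batch norm maintains moving_mean / moving_variance; their update ops are
        # collected in tf.GraphKeys.UPDATE_OPS, which is exactly what the answer
        # below hooks into the train_op.
        norm = tf.contrib.layers.batch_norm(conv, center=True, scale=True,
                                            is_training=phase_train,
                                            updates_collections=tf.GraphKeys.UPDATE_OPS,
                                            scope="bn")
        return tf.nn.relu(norm) if activation else norm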
Answer (score: 0):
I eventually found out that my problem came from batch normalization not being implemented correctly.
The problem was that I did not update the moving_mean and moving_variance correctly. This is pointed out in the TensorFlow docs for batch_norm:
"Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they need to be added as a dependency to the train_op."
So my code now looks like this:
def training(loss):
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # This motif is needed to hook the batch_norm updates up to the training step
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if FLAGS.optimizer == "SGD":
            print("Running with SGD optimizer")
            optimizer = tf.train.GradientDescentOptimizer(0.1)
        elif FLAGS.optimizer == "adam":
            print("Running with adam optimizer")
            optimizer = tf.train.AdamOptimizer(0.001)
        elif FLAGS.optimizer == "adagrad":
            print("Running with adagrad optimizer")
            optimizer = tf.train.AdagradOptimizer(0.01)
        else:
            raise ValueError("optimizer was not recognized.")

        train_op = optimizer.minimize(loss=loss, global_step=global_step)

    return train_op, global_step
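A quick sanity check (hypothetical snippet, run after the graph has been built) is to inspect the collection itself; with contrib's batch_norm it should contain one moving-average assign op per batch-norm layer, and with the control_dependencies wrapper above those ops now run on every training step:

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print("batch-norm update ops:", len(update_ops))
for op in update_ops[:4]:
    print(op.name)  # typically AssignMovingAvg ops for moving_mean / moving_variance

Without that dependency the assign ops never run, so moving_mean and moving_variance stay at their initial values (0 and 1); at test time, with phase_train=False, batch norm then normalizes with those meaningless statistics, which would explain the collapse to a single class described in the question.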
The full implementation of my network can be seen here: https://github.com/mathildor/TF-SegNet