The code snippet below is my code. I load the training data with a queue and feed the validation images through placeholders (feed_dict). As training goes on, the training loss and training accuracy look fine. But in the validation phase, the validation loss and accuracy are strange: the validation loss is very high and the validation accuracy stays very low no matter how many steps I run, basically at the level of random guessing. However, when I set the 'is_training' parameter to True instead of False in the function load_validate_img_data, the validation loss and accuracy become correct. Is there something wrong with how I use batch_norm?
def inference(inputs,
              num_classes=1000,
              is_training=True,
              dropout_keep_prob=0.5,
              reuse=None,
              scope='alexnet'):
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        normalizer_fn=slim.batch_norm,
                        activation_fn=tf.nn.relu,
                        biases_initializer=tf.constant_initializer(0.1),
                        weights_regularizer=slim.l2_regularizer(WEIGHT_DECAY),
                        normalizer_params={'is_training': is_training,
                                           'decay': 0.95, 'reuse': reuse, 'scope': scope}):
        with slim.arg_scope([slim.conv2d], padding='SAME'):
            with slim.arg_scope([slim.max_pool2d], padding='VALID'):
                with tf.variable_scope(scope, [inputs], reuse=reuse) as sc:
                    net = slim.conv2d(inputs, 32, [3, 3], 2, scope='conv1', padding='VALID')
                    net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
                    net = slim.conv2d(net, 64, [3, 3], scope='conv2')
                    net = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
                    net = slim.conv2d(net, 128, [2, 2], scope='conv3')
                    net = slim.max_pool2d(net, [2, 2], 2, scope='pool3')
                    net = slim.conv2d(net, 256, [2, 2], scope='conv4')
                    net = slim.max_pool2d(net, [2, 2], 2, scope='pool4')
                    net = slim.conv2d(net, 512, [2, 2], scope='conv5')
                    net = slim.avg_pool2d(net, [2, 2], scope='pool5')
                    net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6')
                    net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='fc7')
                    net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
    end_points = net
    return net, end_points
def get_softmax_loss(logits, labels, name='train'):
    one_hot_labels = slim.one_hot_encoding(labels, LABEL_NUM)
    softmax_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits))
    vars = tf.trainable_variables()
    regularization_loss = tf.add_n([tf.nn.l2_loss(v) for v in vars]) * 0.0005
    total_loss = softmax_loss + regularization_loss
    return total_loss
def get_train_op(loss):
    lr_in_use = tf.Variable(0.01, trainable=False)
    with tf.name_scope('lr_update'):
        lr_update = tf.assign(lr_in_use, tf.maximum(lr_in_use * 0.5, 0.000001))
    optimizer = tf.train.MomentumOptimizer(lr_in_use, 0.9)
    step = tf.get_variable("step", [], initializer=tf.constant_initializer(0.0), trainable=False)
    train_op = slim.learning.create_train_op(loss, optimizer, global_step=step)
    loss_update = loss
    # control_flow_ops comes from `from tensorflow.python.ops import control_flow_ops`
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        updates = tf.group(*update_ops)
        loss_update = control_flow_ops.with_dependencies([updates], loss)
    return train_op, loss_update, lr_update
def get_train_acc(logits, labels, name='train'):
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.arg_max(logits, 1), labels), tf.float32))
    return accuracy
def load_validate_img_data():
    validate_img_root = '~/data/'
    img_roots = glob(validate_img_root + '*.bmp')
    validate_img = []
    validate_label = []
    read_count = 0
    for root in img_roots:
        if read_count == 400:
            break
        label_root = root.split('/')
        validate_label.append(label_root[-1][:-4])
        validate_img.append(cv2.imread(root))
        read_count += 1
    validate_img = np.array(validate_img).astype(np.float32)
    validate_label = np.array(validate_label).astype(np.int64)
    with tf.name_scope('validate_input'):
        input_imgs = tf.placeholder(tf.float32, shape=(100, ORIGINAL_SIZE[0], ORIGINAL_SIZE[1], CHANNELS), name='imgs')
        input_labels = tf.placeholder(tf.int64, shape=(100), name='labels')
    transfer_input_imgs = ut._resize_crop_img(input_imgs, RESIZE_TO, RESIZE_TO, process_type='validate')
    logits, out_data = face_train.inference(transfer_input_imgs, num_classes=LABEL_NUM, is_training=False, reuse=True)
    validate_accuracy = get_train_acc(logits, input_labels, name='validate')
    validate_loss = get_softmax_loss(logits, input_labels, name='validate')
    return validate_img, validate_label, input_imgs, input_labels, validate_accuracy, validate_loss
with tf.Graph().as_default():
    images, labels = ut._load_batch_t(data_dir, ORIGINAL_SIZE, CHANNELS, BATCH_SIZE, RESIZE_TO, RESIZE_TO)
    # inference() returns (net, end_points), so unpack the tuple
    logits, _ = face_train.inference(images, num_classes=LABEL_NUM)
    accuracy = get_train_acc(logits, labels)
    total_loss = get_softmax_loss(logits, labels)
    train_op, loss_update, lr_update = get_train_op(total_loss)
    validate_img, validate_label, img_placeholer, label_placeholder, validate_accuracy, validate_loss = load_validate_img_data()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10000)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        total_step = 0
        epoc_step = int(SAMPLE_NUM / BATCH_SIZE)
        for epoc in range(EPOC_NUM):
            for step in range(epoc_step):
                _ = sess.run([train_op])
                if total_step % 20 == 0:
                    loss, train_ac = sess.run([loss_update, accuracy])
                    print('epoc : %d, step : %d, train_loss : %.2f, train_acc: %.3f' % (epoc, step, loss, train_ac))
                if total_step % 200 == 0:
                    all_va_acc = 0
                    all_va_loss = 0
                    for i in range(4):
                        feed_dict = {img_placeholer: validate_img[i*100 : (i+1)*100],
                                     label_placeholder: validate_label[i*100 : (i+1)*100]}
                        # merged_val is the validation summary op defined elsewhere in the original code
                        va_acc, va_loss, summary_val = sess.run([validate_accuracy, validate_loss, merged_val], feed_dict=feed_dict)
                        all_va_acc += va_acc
                        all_va_loss += va_loss
                    print('validate_accuracy: %.2f, validate_loss: %.2f' % (all_va_acc/4.0, all_va_loss/4.0))
                total_step += 1
        coord.request_stop()
        coord.join(threads)
Answer 0 (score: 0)
During inference, batch norm uses the moving average mean and the moving average variance, so you need to set the is_training parameter to False:
def inference(inputs,
              num_classes=1000,
              is_training=False,
              dropout_keep_prob=0.5,
              reuse=None,
              scope='alexnet'):
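A minimal sketch of how the two branches are usually wired together (names taken from the code in the question; it assumes the training branch is built first so its variables can be reused):

# Sketch: build the network twice over the same variables.
# The training branch updates the moving statistics; the validation
# branch only reads them.
train_logits, _ = inference(images, num_classes=LABEL_NUM,
                            is_training=True)             # training batch
val_logits, _ = inference(transfer_input_imgs, num_classes=LABEL_NUM,
                          is_training=False, reuse=True)  # validation batch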
Answer 1 (score: 0)
If you still get a very large loss and a very low accuracy when evaluating with is_training=False while using tf.layers.batch_normalization(), keep the following note from the documentation in mind:
Note: when training, the moving_mean and moving_variance need to be updated. By default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they need to be added as a dependency to the train_op.
To fix this, you need to wrap your train_op in a scope that carries these dependencies, like so:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss=loss, global_step=global_step)
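If you keep the TF-Slim setup from the question, slim.learning.create_train_op is documented to pick up the ops in tf.GraphKeys.UPDATE_OPS by default when its update_ops argument is left unset; passing them explicitly just makes the dependency visible. A sketch, with variable names following the question:

# Sketch (TF-Slim, names from the question). create_train_op adds the
# batch-norm update ops as a dependency of the returned train_op; here
# they are passed explicitly instead of relying on the default.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = slim.learning.create_train_op(total_loss, optimizer,
                                         global_step=step,
                                         update_ops=update_ops)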