Batch normalization does not save its moving mean and moving variance
When I train, my model completely overfits the training data (as expected). With batch normalization, training is also faster, as expected. However, immediately after a training step I run the same model on the same data with is_training=False, and it produces very poor results. Moreover, every time I inspect moving_mean and moving_variance they are still at their default values. They never update.
(u'main/y/y/moving_mean:0', array([ 0., 0.], dtype=float32))
(u'main/y/y/moving_variance:0', array([ 1., 1.], dtype=float32))
(u'main/y/y/moving_mean:0', array([ 0., 0.], dtype=float32))
(u'main/y/y/moving_variance:0', array([ 1., 1.], dtype=float32))
700 with generated means (training = true} 1.0 with saved means {training = false} 0.4911
I have the update_ops code below, but it does not seem to do the trick. Setting update_collections=None makes it work, but I have been told that is a suboptimal solution for performance reasons.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if update_ops:
    updates = tf.group(*update_ops)
    cost = with_dependencies([updates], cost)
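For reference, the workaround I mentioned amounts to passing the option through normalizer_params so that batch_norm applies its moving-average updates in place. This is only a sketch of a modified copy of my ffnn_model below; the helper name ffnn_model_inplace_updates is just for illustration, and the keyword spelling 'updates_collections' is my reading of the contrib batch_norm signature.

# Sketch of the updates_collections=None workaround (reportedly suboptimal for performance):
# batch_norm runs its moving-average updates together with the layer instead of deferring
# them to the UPDATE_OPS collection.
def ffnn_model_inplace_updates(inputs, num_classes, batch_size, is_training, reuse=False):
    output = fully_connected(inputs,
                             num_classes * 2,
                             activation_fn=None,
                             normalizer_fn=batch_norm,
                             normalizer_params={'is_training': is_training,
                                                'updates_collections': None,  # update in place
                                                'reuse': reuse,
                                                'scope': 'y'},
                             reuse=reuse,
                             scope='y')
    y = softmax(tf.reshape(output, [batch_size, num_classes, 2]))
    return y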
My full code is below.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected, softmax, batch_norm
from tensorflow.python.ops.control_flow_ops import with_dependencies
from tensorflow.python.training.adam import AdamOptimizer
batch_size = 100
input_size = 10
noise_strength = 4
class Data(object):
    def __init__(self, obs, gold):
        self.obs = obs
        self.gold = gold

def generate_data(batch_size, input_size, noise_strength):
    input = np.random.rand(batch_size, input_size) * noise_strength
    gold = np.random.randint(0, 2, (input_size, 1))
    input = input + gold
    return Data(input, gold)
def ffnn_model(inputs, num_classes, batch_size, is_training, reuse=False):
    output = fully_connected(inputs,
                             num_classes * 2,
                             activation_fn=None,
                             normalizer_fn=batch_norm,
                             normalizer_params={'is_training': is_training, 'reuse': reuse, 'scope': 'y'},
                             reuse=reuse,
                             scope='y')
    y = softmax(tf.reshape(output, [batch_size, num_classes, 2]))
    return y
#objective function
def objective_function(y, gold):
    indices = tf.stack([tf.range(tf.size(gold)), tf.reshape(gold, [-1])], axis=1)
    scores = tf.gather_nd(tf.reshape(y, [-1, 2]), indices=indices)
    # return tf.cast(indices,tf.float32),-tf.reduce_mean(tf.log(scores+1e-6))
    return -tf.reduce_mean(tf.log(scores + 1e-6))
def train_op(y, gold):
    cost = objective_function(y, gold)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        print "yes to update_ops"
        print update_ops
        updates = tf.group(*update_ops)
        cost = with_dependencies([updates], cost)
    train_step = AdamOptimizer().minimize(cost)
    return train_step
def predictions_op(y):
    return tf.cast(tf.argmax(y, axis=len(y.get_shape()) - 1), dtype=tf.int32)

def accuracy_op(y, gold):
    return tf.reduce_mean(tf.cast(tf.equal(predictions_op(y), gold), tf.float32))
def model(batch_size, num_classes, input_size, scope, reuse):
    with tf.variable_scope(scope) as m:
        if reuse:
            m.reuse_variables()
        is_training = tf.placeholder(tf.bool)
        x = tf.placeholder(tf.float32, shape=[batch_size, input_size])
        y = ffnn_model(x, num_classes=1, batch_size=batch_size, is_training=is_training, reuse=reuse)
        g = tf.placeholder(tf.int32, shape=[batch_size, num_classes])
    return g, x, y, is_training
def train(batch_size=100, input_size=100):
    scope = "main"
    g, x, y, is_training = model(batch_size, 1, input_size, scope, reuse=None)
    with tf.Session() as sess:
        train_step, accuracy, predictions = train_op(y, g), accuracy_op(y, g), predictions_op(y)
        cost_op = objective_function(y, g)
        init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer())
        sess.run(init_op)
        accs = []
        accs2 = []
        costs = []
        for i in range(10000):
            data = generate_data(batch_size, input_size, noise_strength)
            _, acc, cost = sess.run([train_step, accuracy, cost_op], feed_dict={x: data.obs, g: data.gold, is_training: True})
            acc2 = sess.run(accuracy, feed_dict={x: data.obs, g: data.gold, is_training: False})
            accs.append(acc)
            accs2.append(acc2)
            costs.append(cost)
            if i % 100 == 0:
                # print scurrs
                print i, "with generated means (training = true}", np.mean(accs[-100:]), "with saved means {training = false}", np.mean(accs2[-100:])
                # print sess.run(predictions, feed_dict={x: data.obs, g: data.gold, is_training: False})
                vars = [var for var in tf.global_variables() if 'moving' in var.name]
                rv = sess.run(vars, {is_training: False})
                rt = sess.run(vars, {is_training: True})
                print "\t".join([str((v.name, a)) for a, v in zip(rv, vars)]), \
                    "\n", \
                    "\t".join([str((v.name, a)) for a, v in zip(rt, vars)])

if __name__ == "__main__":
    train()
Answer 0 (score: 0)
Batch normalization creates ops that must be run in order to update its values. It also adds them to a specific collection, so if you use the tf.contrib.layers.optimize_loss function, it will gather those ops for you and run them whenever the training op runs.
So to fix this, replace:
train_step = AdamOptimizer().minimize(cost)
with
train_step = optimize_loss(loss, step, learning_rate, optimizer='ADAM')
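For completeness, here is a minimal sketch of how that substitution could look inside the train_op function from the question. The explicit global_step variable, the learning rate of 0.001 (Adam's default), and the 'Adam' optimizer key are assumptions on my part, not part of the original one-liner.

from tensorflow.contrib.layers import optimize_loss

def train_op(y, gold):
    cost = objective_function(y, gold)
    # optimize_loss runs the ops collected in tf.GraphKeys.UPDATE_OPS (the batch-norm
    # moving-average updates) together with the gradient step.
    step = tf.Variable(0, trainable=False, name='global_step')
    return optimize_loss(cost, step, learning_rate=0.001, optimizer='Adam')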