Batch normalization does not save its moving mean and moving variance
When I train, my model completely overfits the training data (as expected). With batch normalization, training is also faster, as expected. However, immediately after a training step I run the same model on the same data with is_training=False, and it produces very poor results. Moreover, every time I inspect moving_mean and moving_variance they are still at their default values. They never update.
(u'main/y/y/moving_mean:0', array([ 0., 0.], dtype=float32))
(u'main/y/y/moving_variance:0', array([ 1., 1.], dtype=float32))
(u'main/y/y/moving_mean:0', array([ 0., 0.], dtype=float32))
(u'main/y/y/moving_variance:0', array([ 1., 1.], dtype=float32))
700 with generated means (training = true} 1.0 with saved means {training = false} 0.4911
I have the update_ops code below, but it does not seem to do the trick. Setting update_collections=None makes it work, but I have been told that is a suboptimal solution for performance reasons.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if update_ops:
    updates = tf.group(*update_ops)
    cost = with_dependencies([updates], cost)
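For reference, the workaround I mentioned amounts to passing the option through normalizer_params so that batch_norm applies its moving-average updates in place. This is only a sketch of a modified copy of my ffnn_model below; the helper name ffnn_model_inplace_updates is just for illustration, and the keyword spelling 'updates_collections' is my reading of the contrib batch_norm signature.

# Sketch of the updates_collections=None workaround (reportedly suboptimal for performance):
# batch_norm runs its moving-average updates together with the layer instead of deferring
# them to the UPDATE_OPS collection.
def ffnn_model_inplace_updates(inputs, num_classes, batch_size, is_training, reuse=False):
    output = fully_connected(inputs,
                             num_classes * 2,
                             activation_fn=None,
                             normalizer_fn=batch_norm,
                             normalizer_params={'is_training': is_training,
                                                'updates_collections': None,  # update in place
                                                'reuse': reuse,
                                                'scope': 'y'},
                             reuse=reuse,
                             scope='y')
    y = softmax(tf.reshape(output, [batch_size, num_classes, 2]))
    return y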
My full code is below.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected, softmax, batch_norm
from tensorflow.python.ops.control_flow_ops import with_dependencies
from tensorflow.python.training.adam import AdamOptimizer
batch_size = 100
input_size = 10
noise_strength = 4
class Data(object):
    def __init__(self, obs, gold):
        self.obs = obs
        self.gold = gold

def generate_data(batch_size, input_size, noise_strength):
    input = np.random.rand(batch_size, input_size) * noise_strength
    gold = np.random.randint(0, 2, (input_size, 1))
    input = input + gold
    return Data(input, gold)
def ffnn_model(inputs, num_classes, batch_size, is_training, reuse=False):
    output = fully_connected(inputs,
                             num_classes * 2,
                             activation_fn=None,
                             normalizer_fn=batch_norm,
                             normalizer_params={'is_training': is_training, 'reuse': reuse, 'scope': 'y'},
                             reuse=reuse,
                             scope='y')
    y = softmax(tf.reshape(output, [batch_size, num_classes, 2]))
    return y
#objective function
def objective_function(y, gold):
    indices = tf.stack([tf.range(tf.size(gold)), tf.reshape(gold, [-1])], axis=1)
    scores = tf.gather_nd(tf.reshape(y, [-1, 2]), indices=indices)
    # return tf.cast(indices,tf.float32),-tf.reduce_mean(tf.log(scores+1e-6))
    return -tf.reduce_mean(tf.log(scores + 1e-6))
def train_op(y, gold):
    cost = objective_function(y, gold)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        print "yes to update_ops"
        print update_ops
        updates = tf.group(*update_ops)
        cost = with_dependencies([updates], cost)
    train_step = AdamOptimizer().minimize(cost)
    return train_step
def predictions_op(y):
    return tf.cast(tf.argmax(y, axis=len(y.get_shape()) - 1), dtype=tf.int32)

def accuracy_op(y, gold):
    return tf.reduce_mean(tf.cast(tf.equal(predictions_op(y), gold), tf.float32))
def model(batch_size, num_classes, input_size, scope, reuse):
    with tf.variable_scope(scope) as m:
        if reuse:
            m.reuse_variables()
        is_training = tf.placeholder(tf.bool)
        x = tf.placeholder(tf.float32, shape=[batch_size, input_size])
        y = ffnn_model(x, num_classes=1, batch_size=batch_size, is_training=is_training, reuse=reuse)
        g = tf.placeholder(tf.int32, shape=[batch_size, num_classes])
    return g, x, y, is_training
def train(batch_size=100, input_size=100):
    scope = "main"
    g, x, y, is_training = model(batch_size, 1, input_size, scope, reuse=None)
    with tf.Session() as sess:
        train_step, accuracy, predictions = train_op(y, g), accuracy_op(y, g), predictions_op(y)
        cost_op = objective_function(y, g)
        init_op = tf.group(tf.local_variables_initializer(), tf.global_variables_initializer())
        sess.run(init_op)
        accs = []
        accs2 = []
        costs = []
        for i in range(10000):
            data = generate_data(batch_size, input_size, noise_strength)
            _, acc, cost = sess.run([train_step, accuracy, cost_op], feed_dict={x: data.obs, g: data.gold, is_training: True})
            acc2 = sess.run(accuracy, feed_dict={x: data.obs, g: data.gold, is_training: False})
            accs.append(acc)
            accs2.append(acc2)
            costs.append(cost)
            if i % 100 == 0:
                # print scurrs
                print i, "with generated means (training = true}", np.mean(accs[-100:]), "with saved means {training = false}", np.mean(accs2[-100:])
                # print sess.run(predictions, feed_dict={x: data.obs, g: data.gold, is_training: False})
                vars = [var for var in tf.global_variables() if 'moving' in var.name]
                rv = sess.run(vars, {is_training: False})
                rt = sess.run(vars, {is_training: True})
                print "\t".join([str((v.name, a)) for a, v in zip(rv, vars)]), \
                    "\n", \
                    "\t".join([str((v.name, a)) for a, v in zip(rt, vars)])

if __name__ == "__main__":
    train()
Answer 0 (score: 0)
Batch normalization creates ops that must be run in order to update its values. It also adds them to a specific collection, so if you use the tf.contrib.layers.optimize_loss function, it will gather those ops for you and run them whenever the training op runs.
So to fix this, replace:
train_step = AdamOptimizer().minimize(cost)
with
train_step = optimize_loss(loss, step, learning_rate, optimizer='ADAM')
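For completeness, here is a minimal sketch of how that substitution could look inside the train_op function from the question. The explicit global_step variable, the learning rate of 0.001 (Adam's default), and the 'Adam' optimizer key are assumptions on my part, not part of the original one-liner.

from tensorflow.contrib.layers import optimize_loss

def train_op(y, gold):
    cost = objective_function(y, gold)
    # optimize_loss runs the ops collected in tf.GraphKeys.UPDATE_OPS (the batch-norm
    # moving-average updates) together with the gradient step.
    step = tf.Variable(0, trainable=False, name='global_step')
    return optimize_loss(cost, step, learning_rate=0.001, optimizer='Adam')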