重述问题

Question

我在 TensorFlow 中使用批量标准化（batch normalization）时遇到了问题。我构建了以下模型：

def weight_variable(kernal_shape):
    """Create the trainable float32 'weights' variable through ``tf.get_variable``.

    Args:
        kernal_shape: shape handed to ``tf.get_variable``.

    Returns:
        The variable, initialised from a truncated normal distribution with
        a standard deviation of 0.02.
    """
    init = tf.truncated_normal_initializer(stddev=0.02)
    return tf.get_variable(
        name='weights',
        shape=kernal_shape,
        dtype=tf.float32,
        initializer=init,
        trainable=True,
    )
def bias_variable(shape):
    """Return a new ``tf.Variable`` of the given shape, initialised to 0.0."""
    return tf.Variable(tf.constant(0.0, shape=shape))

# return 1 conv layer
def conv_layer(x, w_shape, b_shape, is_training, padding='SAME'):
    """Build one stride-2 convolution -> batch norm -> ReLU stage.

    Args:
        x: input tensor fed to ``tf.nn.conv2d``.
        w_shape: shape of the convolution kernel variable.
        b_shape: shape of the bias variable added to the convolution output.
        is_training: forwarded unchanged to ``tf.contrib.layers.batch_norm``.
        padding: padding mode passed to the convolution.

    Returns:
        The ReLU activations. Histogram summaries are registered for the
        weights, the biases and the activations as a side effect.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram("biases", bias)

    # A stride of 2 is used on purpose so that no max-pool layer is needed.
    convolved = tf.nn.conv2d(x, kernel, strides=[1, 2, 2, 1], padding=padding)
    pre_norm = convolved + bias
    normalized = tf.contrib.layers.batch_norm(pre_norm, scale=True, is_training=is_training)

    activations = tf.nn.relu(normalized)
    tf.summary.histogram("activations", activations)
    return activations

# return deconv layer
def deconv_layer(x, w_shape, b_shape, is_training, padding="SAME", activation='relu'):
    """Build one stride-2 transposed convolution -> batch norm -> activation stage.

    Args:
        x: input tensor fed to ``tf.nn.conv2d_transpose``.
        w_shape: shape of the kernel variable; ``w_shape[2]`` is used as the
            channel count of the output.
        b_shape: shape of the bias variable added to the transposed convolution.
        is_training: forwarded unchanged to ``tf.contrib.layers.batch_norm``.
        padding: padding mode passed to the transposed convolution.
        activation: ``'relu'`` selects ReLU; any other value selects sigmoid.

    Returns:
        The activated tensor. Histogram summaries are registered for the
        weights, the biases and the activations as a side effect.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram('biases', bias)

    # Output shape: [batch, 2 * height, 2 * width, w_shape[2]]. The spatial
    # size is doubled because the conv layers halve it with a stride of 2.
    in_shape = tf.shape(x)
    batch = in_shape[0]
    height = in_shape[1] * 2
    width = in_shape[2] * 2
    out_shape = tf.stack([batch, height, width, w_shape[2]])

    upsampled = tf.nn.conv2d_transpose(x, kernel, out_shape, [1, 2, 2, 1], padding=padding)
    pre_norm = upsampled + bias
    normalized = tf.contrib.layers.batch_norm(pre_norm, scale=True, is_training=is_training)

    activate = tf.nn.relu if activation == 'relu' else tf.nn.sigmoid
    transposed_activations = activate(normalized)

    tf.summary.histogram("transpose_activation", transposed_activations)
    return transposed_activations

def model(input):
    """Build the conv encoder -> latent sample -> GRU -> linear output graph.

    Relies on names defined outside this function: ``phase_train``,
    ``batch_size``, ``num_participants``, ``latent_dim``, ``time_steps`` and
    ``cell_size``.

    Args:
        input: image batch fed to the first convolution (3 input channels,
            per the conv1 kernel shape). The name shadows the builtin but is
            kept so existing keyword callers keep working.

    Returns:
        A ``(predictions, var_list)`` tuple. ``var_list`` is a flat list with
        every global variable whose name contains 'GRU', followed by the two
        output-layer variables.
    """
    # Four stride-2 conv stages; the size comments are the author's expected
    # spatial sizes after each stage.
    with tf.variable_scope('conv1'):
        conv1 = conv_layer(input, [4, 4, 3, 32], [32], is_training=phase_train)  # image size: [56, 56]
    with tf.variable_scope('conv2'):
        conv2 = conv_layer(conv1, [4, 4, 32, 64], [64], is_training=phase_train)  # image size: [28, 28]
    with tf.variable_scope('conv3'):
        conv3 = conv_layer(conv2, [4, 4, 64, 128], [128], is_training=phase_train)  # image size: [14, 14]
    with tf.variable_scope('conv4'):
        conv4 = conv_layer(conv3, [4, 4, 128, 256], [256], is_training=phase_train)  # image size: [7, 7]
        conv4_reshaped = tf.reshape(conv4, [batch_size * num_participants, 7 * 7 * 256], name='conv4_reshaped')

    # Fully connected projections producing the two latent-distribution parameters.
    w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
    b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
    w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
    b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
    # One noise row of shape [1, latent_dim], broadcast over every row of the batch.
    epsilon = tf.random_normal([1, latent_dim])

    tf.summary.histogram('weights_c_mu', w_c_mu)
    tf.summary.histogram('biases_c_mu', b_c_mu)
    tf.summary.histogram('weights_c_sig', w_c_sig)
    tf.summary.histogram('biases_c_sig', b_c_sig)

    with tf.variable_scope('mu'):
        mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
        tf.summary.histogram('mu', mu)

    with tf.variable_scope('stddev'):
        stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
        tf.summary.histogram('stddev', stddev)

    with tf.variable_scope('z'):
        # This formula was adopted from the following paper: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7979344
        # `stddev` is exponentiated and square-rooted before scaling the noise.
        latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
        tf.summary.histogram('features_sig', stddev)

    with tf.variable_scope('GRU'):
        print(latent_var.get_shape().as_list())
        # Group the rows into sequences of `time_steps` frames. The sequence
        # count is derived from `time_steps` instead of a hard-coded 100, so
        # the reshape stays consistent with the flattening of H below.
        num_sequences = (batch_size // time_steps) * num_participants
        latent_var = tf.reshape(latent_var, shape=[num_sequences, time_steps, latent_dim])

        cell = tf.nn.rnn_cell.GRUCell(cell_size)   # state_size of cell_size.
        # H size: [num_sequences, time_steps, cell_size]; the final state is unused.
        H, _ = tf.nn.dynamic_rnn(cell, latent_var, dtype=tf.float32)
        H = tf.reshape(H, [batch_size * num_participants, cell_size])

    with tf.variable_scope('output'):
        # output layer.
        # NOTE(review): name='w_output' is given to tf.truncated_normal, not to
        # tf.Variable, so the variable keeps its default name. Left unchanged
        # because renaming it would break restoring existing checkpoints.
        w_output = tf.Variable(tf.truncated_normal([cell_size, 1], mean=0, stddev=0.01, dtype=tf.float32, name='w_output'))
        tf.summary.histogram('w_output', w_output)
        b_output = tf.get_variable('b_output', shape=[1], dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))
        predictions = tf.add(tf.matmul(H, w_output), b_output, name='softmax_output')
        tf.summary.histogram('output', predictions)

        var_list = [v for v in tf.global_variables() if 'GRU' in v.name]
        # extend (not append) keeps var_list a flat list of variables.
        var_list.extend([w_output, b_output])

    return predictions, var_list

另外，我正在恢复模型参数如下：

saver_torestore = tf.train.Saver()

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(events_path, sess.graph)
    merged = tf.summary.merge_all()

    to_run_list = [merged, RMSE]

    # init_op is defined outside this snippet. It runs before the restore so
    # that values restored from a checkpoint are not overwritten afterwards.
    sess.run(init_op)

    # Note that the last name "Graph_model" is the name of the saved checkpoints file => the ckpt is saved
    # under tensorboard_logs.
    ckpt = tf.train.get_checkpoint_state(os.path.dirname(model_path))
    if ckpt and ckpt.model_checkpoint_path:
        saver_torestore.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoints are saved!!!')
    else:
        print('No stored checkpoints')

    # NOTE(review): `losses` is not defined in this snippet; it is assumed to
    # be a list created earlier in the file - confirm.
    counter = 0
    for epoch in range(num_epoch):
        sess.run(iterator.initializer)
        print('epoch:', epoch)

        # Keep pulling batches until the iterator is exhausted, i.e. until
        # the end of the current epoch.
        while True:
            try:
                summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: False})
            except tf.errors.OutOfRangeError:
                print('error, ignore ;) ')
                break

            print('loss: ' + str(loss_))

            losses.append(loss_)
            counter += 1

            train_writer.add_summary(summary, counter)

    # These two lines were indented with 5 spaces (an IndentationError); they
    # belong to the `with` block, after all epochs have finished.
    print('average losses:', np.average(losses))
    train_writer.close()

我确认过这些变量确实已被保存，因此我运行了以下代码：

def assign_values_to_batchNorm(sess=None):
    """Load batch-norm variable values from text files and assign them.

    For every global variable whose name contains "BatchNorm" but not "Adam",
    a file named after the variable (the trailing two characters of the name
    dropped, "/" replaced by "_", ".txt" appended) is read. The file holds
    ';'-terminated float values.

    ``tf.assign`` only builds an op; the variable changes when that op is
    run. The ops are therefore executed in ``sess`` or, if ``sess`` is None,
    in the default session when one is active.

    Args:
        sess: optional session in which to run the assignments.

    Returns:
        The list of assign ops, so a caller without an active session can
        run them later.
    """
    session = sess if sess is not None else tf.get_default_session()

    bn_vars = [v for v in tf.global_variables()
               if "BatchNorm" in v.name and "Adam" not in v.name]

    assign_ops = []
    for var in bn_vars:
        file_name = var.name[:-2].replace("/", "_") + ".txt"
        # The context manager closes the file even if parsing fails.
        with open(file_name) as handle:
            lst = handle.read().split(";")[:-1]
        print(lst)
        values = [np.float32(item) for item in lst]
        assign_ops.append(tf.assign(var, values))

    if session is not None:
        session.run(assign_ops)
    return assign_ops

请注意，我已使用此方法手动恢复移动平均值和移动方差值。但我得到了同样的结果。

我在会话（session）内调用了 assign_values_to_batchNorm()。我读到了一些值 => 看来移动均值（moving mean）、移动方差（moving variance）、gamma 和 beta 都已被保存。

另外请注意，我使用的是 Windows 10，TensorFlow 版本为 1.3。

因此，每当我在会话内、在初始化/恢复所有变量之后运行summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: True})时，得到的RMSE为0.022，这与模型训练结束时得到的误差相同。而如果我把phase_train设置为False，RMSE就变成0.038。请注意，我只是在测试网络：即使我用的是训练数据集，我的目的也只是观察网络在训练模式和测试模式下的行为。所以这个结果让我觉得非常奇怪。另请注意，phase_train是一个占位符，我的代码中有如下定义：

# Boolean placeholder fed at run time to switch batch norm between modes.
phase_train = tf.placeholder(tf.bool, name='phase')

此外，以下是优化程序的代码段：

with tf.name_scope('optimizer'):
    # Ops registered in the UPDATE_OPS collection are made control
    # dependencies of the training step, so they run with every step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    adam = tf.train.AdamOptimizer(learning_rate=1e-5)
    with tf.control_dependencies(update_ops):
        # `optimizer` holds the op returned by minimize(), not the optimizer object.
        optimizer = adam.minimize(RMSE)

主要问题：当phase = False时RMSE = 0.038，当phase = True时为0.022。

非常感谢任何帮助!!

Answer 1

重述问题

看起来您关心的是：除了phase_train的值之外，您保持一切相同（训练数据，批次数，训练数据的时期数，初始化等）。当phase_train=True时，进行训练产生的RMSE为0.022，当phase_train=False时，进行训练产生的RMSE为0.038，并且您认为无论phase_train的值如何，RMSE都应该是相同（0.022或0.038）。如果那不是你的意思，请告诉我。

回答问题

这里的答案是，当phase_train=True与phase_train=False时，RMSE应该有所不同。让我们看看为什么会这样。

您可以像这样设置图表：

# The phase_train placeholder is what gets forwarded as is_training.
conv1 = conv_layer(input, [4, 4, 3, 32], [32], is_training=phase_train)

然后，如果我们查看conv_layer(...)函数中的代码，您可以像这样使用is_training变量：

# is_training is passed straight through to the batch-norm layer.
conv = tf.contrib.layers.batch_norm(conv, scale=True, is_training=is_training)

现在，让我们看一下tf.contrib.layers.batch_norm（https://www.tensorflow.org/api_docs/python/tf/contrib/layers/batch_norm）的文档：

is_training：该层是否处于训练模式。在训练模式下，它会以给定的衰减率（decay）通过指数移动平均，把矩（moments）的统计量累积到moving_mean和moving_variance中。不在训练模式时，它会直接使用moving_mean和moving_variance的值。

从文档中可以看出，is_training=True 与 is_training=False 会产生不同的行为。具体来说，当is_training=True时，使用衰减计算归一化常数，而当is_training=False时，没有衰减。当您切换is_training的值时，您的代码会执行不同的操作，因此您的RMSE误差也会有所不同。

如果遇到类似这样的问题，查看Tensorflow文档可能有助于解释意外结果。此外，我们不建议在某处将is_training标记设置为False来训练您的模型。

为什么在is_training = True时使用衰减？

您可能想知道为什么Tensorflow会在is_training=True时添加衰减。答案是，当您进行训练时，神经网络中的权重正在不断更新，变得越来越好。这意味着较早的更新所得到的均值和方差非常不准确，而较晚的更新所得到的均值和方差则准确得多。由于早期的更新不准确，您希望降低它们对神经网络权重更新方式的影响，因此每经过一次后续更新，它们的贡献都会乘以0.999而衰减。

例如，当is_training=True时，第一次权重更新所得到的均值和方差，其重要性只有第10,000次更新所得均值和方差的 0.999^10000 ≈ 0.000045 倍。这是有道理的，因为在第一次更新时，您的权重基本上是随机的，绝对不会产生像第10000次更新那样有意义的均值和方差。

当is_training=False时，这意味着您告诉Tensorflow您已经在神经网络中学到了合适的权重。您告诉Tensorflow一切都已训练完毕，权重是有意义的，批量标准化中得到的均值和方差也是有意义的。所以，没有必要对任何东西进行衰减。

此解释与您的RMSE错误一致。如果你使用is_training=False进行训练，那么通过对初始化神经网络的随机权重赋予更多的意义，你效率低下，所以你的最终模型不会那么好。正如您所观察到的，is_training=False运行时的RMSE错误高于is_training=True运行时的RMSE错误。

Answer 2

所以我认为使用批量规范化层可能存在问题。所以我创建了一个简单的模型并在MNIST数据集上训练它。因此，我们有2个场景，在第一种情况下，使用批量规范训练模型，第2，在没有批量规范的情况下训练它。

现在，如果我们比较使用和不使用批量标准化（BN）时的测试结果，就会看到使用BN时可以获得更高的准确率或更低的损失。请记住，对于包含BN的模型，在测试时phase被设置为False。因此，我们可以得出结论，带有BN的模型比不带BN的模型更好。

其次，如果我们考虑使用批量标准化训练的模型。现在，如果我们比较测试集上的损失（同时将阶段设置为True，另一方面设置为False ），我们得出结论，当将阶段设置为True时，我们可以获得更好的结果。因为，直观地说，使用当前批次的统计数据比训练数据集的统计数据更准确。

总之，我的问题出现在使用批量标准化训练模型，并在将阶段设置为True时测试模型，然后是False。因此，在将阶段设置为true而不是false时，我们肯定会获得更好的损失（更低）。

如何在 TensorFlow 中使用批量标准化（BatchNormalization）？

2 个答案:

重述问题

回答问题

为什么在is_training = True时使用衰减？