批量归一化与使用SELU的自归一化神经网络的区别

时间:2017-07-15 19:45:44

标签: neural-network batch-normalization

我想知道批量归一化(Batch Normalization)与自归一化神经网络之间的区别。换句话说,SELU(Scaled Exponential Linear Unit)能否取代批量归一化?

此外,我查看了SELU激活之后的值,发现它们落在[-1, 1]范围内,而批量归一化并非如此:BN层之后(ReLU激活之前)的值大致落在[-a, a]范围内,而不是[-1, 1]。

以下是我在SELU激活之后以及批量归一化层之后打印这些值的方法:

# Wrap the batch-norm output in tf.Print so that its maximum and minimum are
# logged, prefixed with the current scope name, whenever the tensor is evaluated.
# `batch_norm_layer` and `name_scope` are defined outside this fragment.
batch_norm_layer = tf.Print(batch_norm_layer,
                           data=[tf.reduce_max(batch_norm_layer), tf.reduce_min(batch_norm_layer)],
                           message = name_scope + ' min and max') 

SELU激活使用的代码与此类似……

批量归一化层的定义如下:

def batch_norm(x, n_out, phase_train, in_conv_layer = True):
    """Batch-normalise `x` with a learnable offset (`beta`) and scale (`gamma`).

    Args:
        x: Tensor to normalise.
        n_out: Shape used for the `beta` and `gamma` constants (presumably the
            number of output channels/features -- confirm with callers).
        phase_train: Boolean tensor handed to `tf.cond`. When true, the batch
            statistics are used and the moving averages are updated; when
            false, the stored moving averages are used instead.
        in_conv_layer: When True, moments are computed over axes [0, 1, 2];
            otherwise over axes [0, 1].

    Returns:
        The output of `tf.nn.batch_normalization` (variance epsilon 1e-3).
    """

    with tf.variable_scope('bn'):
        # Learnable offset, initialised to 0.
        beta = tf.Variable(tf.constant(0.0, shape=n_out),
                                     name='beta', trainable=True)
        # Learnable scale, initialised to 1.
        gamma = tf.Variable(tf.constant(1.0, shape=n_out),
                                      name='gamma', trainable=True)
        if in_conv_layer:
            batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
        else:
            # NOTE(review): for a rank-2 input, axes [0, 1] reduce over every
            # dimension and give scalar statistics -- confirm the intended rank.
            batch_mean, batch_var = tf.nn.moments(x, [0, 1], name='moments')

        ema = tf.train.ExponentialMovingAverage(decay=0.9999)

        def mean_var_with_update():
            # The identity ops are created under a control dependency on the
            # EMA update, so fetching them also runs the update.
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        # Training: batch statistics (+ EMA update). Otherwise: moving averages.
        mean, var = tf.cond(phase_train,
                            mean_var_with_update,
                            lambda: (ema.average(batch_mean), ema.average(batch_var)))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return normed

由于批量归一化层输出的值更大,损失急剧增大,最终得到了NaN。

此外,我尝试在使用批量归一化的同时降低学习率,但这也没有帮助。那么该如何解决这个问题?

以下是代码:

import tensorflow as tf
import numpy as np
import os
import cv2

# Number of examples per batch produced by tf.train.batch.
batch_size = 32
# Number of epochs the filename queue yields; the training loop requests a
# stop once its epoch counter reaches num_epoch - 2.
num_epoch = 102
# Width of the latent code (second dimension of the mu / sigma projections).
latent_dim = 100

def weight_variable(kernal_shape):
    """Get the trainable float32 variable called 'weights' in the current scope.

    Args:
        kernal_shape: Shape of the variable to create.

    Returns:
        The variable, initialised from a truncated normal with stddev 0.02.
    """
    initializer = tf.truncated_normal_initializer(stddev=0.02)
    return tf.get_variable(name='weights',
                           shape=kernal_shape,
                           dtype=tf.float32,
                           trainable=True,
                           initializer=initializer)

def bias_variable(shape):
    """Return a new `tf.Variable` of the given shape, filled with 0.0."""
    return tf.Variable(tf.constant(0.0, shape=shape))

def batch_norm(x, n_out, phase_train, convolutional = True):
    """Batch-normalise `x` and return the op that updates the moving averages.

    While training (`phase_train` true) the statistics of the current batch
    are used for normalisation; otherwise the exponential moving averages
    are used. The original code had these two cases swapped, so training
    normalised with moving averages that had barely been updated.

    Args:
        x: Tensor to normalise.
        n_out: Shape of the `beta` / `gamma` parameters.
        phase_train: Scalar boolean tensor; true while training.
        convolutional: When True, moments are taken over axes [0, 1, 2];
            otherwise over axis [0] only.

    Returns:
        A tuple `(normed, update_moving_averages)`. The second element is the
        op that refreshes the moving averages; the caller has to run it
        (e.g. by including it in `sess.run`) for the averages to track the data.
    """
    with tf.variable_scope('bn'):
        exp_moving_avg = tf.train.ExponentialMovingAverage(decay=0.9999)

        # Learnable offset (starts at 0) and scale (starts at 1).
        beta = tf.Variable(tf.constant(0.0, shape=n_out),
                           name='beta', trainable=True)
        gamma = tf.Variable(tf.constant(1.0, shape=n_out),
                            name='gamma', trainable=True)

        axes = [0, 1, 2] if convolutional else [0]
        batch_mean, batch_var = tf.nn.moments(x, axes, name='moments')

        update_moving_averages = exp_moving_avg.apply([batch_mean, batch_var])

        # Training -> statistics of the current batch.
        # Inference -> accumulated moving averages.
        m = tf.cond(phase_train,
                    lambda: batch_mean,
                    lambda: exp_moving_avg.average(batch_mean))
        v = tf.cond(phase_train,
                    lambda: batch_var,
                    lambda: exp_moving_avg.average(batch_var))

        normed = tf.nn.batch_normalization(x, m, v, beta, gamma, 1e-3)
        # Debug output kept from the original: logs the shape of the result.
        normed = tf.Print(normed, data=[tf.shape(normed)], message='size of normed?')
    return normed, update_moving_averages

def conv_layer(x, w_shape, b_shape, padding='SAME', phase_train=None):
    """Stride-2 convolution followed by batch normalisation and ReLU.

    Args:
        x: Input tensor for `tf.nn.conv2d`.
        w_shape: Shape of the convolution kernel variable.
        b_shape: Shape of the bias variable; also passed to `batch_norm` as
            the shape of its parameters.
        padding: Padding mode forwarded to `tf.nn.conv2d`.
        phase_train: Optional boolean tensor forwarded to `batch_norm`.
            Defaults to a constant True, which is what this layer always
            used before the parameter existed.

    Returns:
        A tuple `(activations, update_moving_averages)`, where the second
        element is the update op returned by `batch_norm`.
    """
    W = weight_variable(w_shape)
    tf.summary.histogram("weights", W)

    b = bias_variable(b_shape)
    tf.summary.histogram("biases", b)

    # A stride of 2 is used on purpose so that no max-pool layer is needed.
    conv = tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding=padding) + b

    if phase_train is None:
        phase_train = tf.cast(True, tf.bool)
    conv_batch_norm, update_moving_averages = batch_norm(conv, b_shape, phase_train=phase_train)
    name_scope = tf.get_variable_scope().name

    # Debug output: log the value range of the normalised pre-activations.
    conv_batch_norm = tf.Print(conv_batch_norm,
                               data=[tf.reduce_max(conv_batch_norm), tf.reduce_min(conv_batch_norm)],
                               message=name_scope + ' min and max')

    activations = tf.nn.relu(conv_batch_norm)
    tf.summary.histogram("activations", activations)

    return activations, update_moving_averages

def deconv_layer(x, w_shape, b_shape, padding="SAME", activation='selu', phase_train=None):
    """Stride-2 transposed convolution, batch normalisation and an activation.

    The output has twice the height and width of `x` and `w_shape[2]`
    channels.

    Args:
        x: Input tensor for `tf.nn.conv2d_transpose`.
        w_shape: Shape of the kernel variable.
        b_shape: Shape of the bias variable; also passed to `batch_norm`.
        padding: Padding mode forwarded to `tf.nn.conv2d_transpose`.
        activation: `'selu'` applies ReLU after batch normalisation; any
            other value applies a sigmoid.
            NOTE(review): despite its name, the `'selu'` option has always
            applied `tf.nn.relu` in this file; that behaviour is kept.
        phase_train: Optional boolean tensor forwarded to `batch_norm`.
            Defaults to a constant True, as the layer did before the
            parameter existed.

    Returns:
        A tuple `(transposed_activations, update_moving_averages)`.
    """
    W = weight_variable(w_shape)
    tf.summary.histogram("weights", W)

    b = bias_variable(b_shape)
    tf.summary.histogram('biases', b)

    x_shape = tf.shape(x)
    out_shape = tf.stack([x_shape[0], x_shape[1] * 2, x_shape[2] * 2, w_shape[2]])

    # The convolution and the normalisation are the same for every activation,
    # so they are built once instead of being repeated in each branch.
    conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b
    if phase_train is None:
        phase_train = tf.cast(True, tf.bool)
    conv_trans_batch_norm, update_moving_averages = \
        batch_norm(conv_trans, b_shape, phase_train=phase_train)

    if activation == 'selu':
        transposed_activations = tf.nn.relu(conv_trans_batch_norm)
    else:
        transposed_activations = tf.nn.sigmoid(conv_trans_batch_norm)

    tf.summary.histogram("transpose_activation", transposed_activations)
    return transposed_activations, update_moving_averages

# --- Input pipeline: read examples from a TFRecord file and batch them. ---
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()

_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    # Defaults are not specified since all keys are required.
    features={
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        'annotation_raw': tf.FixedLenFeature([], tf.string)
    })

# Decode one example, i.e. a single record taken from the database.
image = tf.decode_raw(features['image_raw'], tf.uint8)
# The stored height and width are needed to undo the flattening below.
height = tf.cast(features['height'], tf.int32)
width = tf.cast(features['width'], tf.int32)

# The image was flattened when it was serialised, so it is reshaped back to
# [height, width, 3] using the stored dimensions.
image = tf.reshape(image, [height, width, 3])

annotation = tf.cast(features['annotation_raw'], tf.string)

min_after_dequeue = 100
num_threads = 1
capacity = min_after_dequeue + num_threads * batch_size
label_batch, images_batch = tf.train.batch([annotation, image],
                                           shapes=[[], [112, 112, 3]],
                                           batch_size=batch_size,
                                           capacity=capacity,
                                           num_threads=num_threads)

# Each annotation is a comma-separated string; split it into one row of
# numbers per example and keep column 2 as the per-example confidence.
label_batch_splitted = tf.string_split(label_batch, delimiter=',')
label_batch_values = tf.reshape(label_batch_splitted.values, [batch_size, -1])
label_batch_numbers = tf.string_to_number(label_batch_values, out_type=tf.float32)
confidences = tf.slice(label_batch_numbers, begin=[0, 2], size=[-1, 1])

# Cast the uint8 batch to float32. The cast itself does not change the rank;
# the original wrapped the batch in a list and then indexed [0] to undo that,
# which is equivalent to casting the tensor directly.
images_batch = tf.cast(images_batch, tf.float32)

# --- Encoder: four stride-2 conv layers, then linear projections to the
# --- latent parameters and a reparameterised sample.
with tf.name_scope('image_normal'):
    # Standardise every image in the batch individually.
    images_batch = tf.map_fn(lambda img: tf.image.per_image_standardization(img), images_batch)
    #images_batch = tf.Print(images_batch, data=[tf.reduce_max(images_batch), tf.reduce_min(images_batch)],
    #                        message='min and max in images_batch')
with tf.variable_scope('conv1'):
    conv1, uma_conv1 = conv_layer(images_batch, [4, 4, 3, 32], [32])      # image size: [56, 56]
with tf.variable_scope('conv2'):
    conv2, uma_conv2 = conv_layer(conv1, [4, 4, 32, 64], [64])     # image size: [28, 28]
with tf.variable_scope('conv3'):
    conv3, uma_conv3 = conv_layer(conv2, [4, 4, 64, 128], [128])   # image size: [14, 14]
with tf.variable_scope('conv4'):
    conv4, uma_conv4 = conv_layer(conv3, [4, 4, 128, 256], [256])  # image size: [7, 7]
    # Flatten the last feature map so it can feed the dense projections.
    conv4_reshaped = tf.reshape(conv4, [-1, 7 * 7 * 256], name='conv4_reshaped')

# Dense projection parameters for the latent mean (`mu`) and spread (`sig`).
w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
# NOTE(review): the noise has shape [1, latent_dim], so one draw is broadcast
# across every example in the batch -- confirm this is intended.
epsilon = tf.random_normal([1, latent_dim])

tf.summary.histogram('weights_c_mu', w_c_mu)
tf.summary.histogram('biases_c_mu', b_c_mu)
tf.summary.histogram('weights_c_sig', w_c_sig)
tf.summary.histogram('biases_c_sig', b_c_sig)

with tf.variable_scope('mu'):
    mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
    tf.summary.histogram('mu', mu)

with tf.variable_scope('stddev'):
    stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
    tf.summary.histogram('stddev', stddev)

with tf.variable_scope('z'):
    # sqrt(exp(stddev)) is used as the scale of the noise, i.e. `stddev` is
    # treated as a log-variance at this point.
    latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
    # NOTE(review): this logs `stddev` again rather than `latent_var` --
    # possibly a copy-paste slip; confirm which tensor was meant.
    tf.summary.histogram('features_sig', stddev)

# --- Decoder: a dense layer back to 7*7*256 features, then four stride-2
# --- transposed-conv layers.
w_dc = tf.Variable(tf.truncated_normal([latent_dim, 7 * 7 * 256], stddev=0.1), name='weights_dc')
b_dc = tf.Variable(tf.constant(0.0, shape=[7 * 7 * 256]), name='biases_dc')
tf.summary.histogram('weights_dc', w_dc)
tf.summary.histogram('biases_dc', b_dc)

with tf.variable_scope('deconv4'):
    deconv4 = tf.nn.bias_add(tf.matmul(latent_var, w_dc), b_dc)
    # Dense output, so batch_norm is called with convolutional=False.
    deconv4_batch_norm, uma_deconv4 = \
        batch_norm(deconv4, [7 * 7 * 256], phase_train=tf.cast(True, tf.bool), convolutional=False)

    deconv4 = tf.nn.relu(deconv4_batch_norm)
    # Restore the spatial layout expected by the transposed convolutions.
    deconv4_reshaped = tf.reshape(deconv4, [-1, 7, 7, 256], name='deconv4_reshaped')

with tf.variable_scope('deconv3'):
    deconv3, uma_deconv3 = deconv_layer(deconv4_reshaped, [3, 3, 128, 256], [128], activation='selu')
with tf.variable_scope('deconv2'):
    deconv2, uma_deconv2 = deconv_layer(deconv3, [3, 3, 64, 128], [64], activation='selu')
with tf.variable_scope('deconv1'):
    deconv1, uma_deconv1 = deconv_layer(deconv2, [3, 3, 32, 64], [32], activation='selu')
with tf.variable_scope('deconv_image'):
    # The last layer uses the sigmoid branch of deconv_layer.
    deconv_image_batch, uma_deconv = deconv_layer(deconv1, [3, 3, 3, 32], [3], activation='sigmoid')

# --- Loss: confidence-weighted reconstruction term plus KL term. ---
with tf.name_scope('loss_likelihood'):
    # temp1 shape: [32, 112, 112, 3]
    # Per-pixel binary cross-entropy; 1e-9 keeps the logarithms finite.
    # NOTE(review): `images_batch` has been passed through
    # per_image_standardization earlier in the file, so the targets are not
    # confined to [0, 1] as this cross-entropy form assumes -- verify.
    temp1 = images_batch * tf.log(deconv_image_batch + 1e-9) + (1 - images_batch) * tf.log(1 - deconv_image_batch + 1e-9)

    # Multiplying temp1 by `confidences` directly fails because the shapes do
    # not broadcast, so two singleton axes are inserted first.
    confidences_ = tf.expand_dims(tf.expand_dims(confidences, axis=1), axis=1) # shape: [32, 1, 1, 1].
    temp1 = temp1 * confidences_
    log_likelihood = -tf.reduce_sum(temp1, reduction_indices=[1, 2, 3])
    log_likelihood_total = tf.reduce_sum(log_likelihood)

with tf.name_scope('loss_KL'):
    # temp2 shape: [32, 100] (batch_size x latent_dim)
    # The sampling step scales the noise by sqrt(exp(stddev)), i.e. `stddev`
    # holds the log-variance. The closed-form KL term is therefore
    #     1 + log(sigma^2) - mu^2 - sigma^2  with  log(sigma^2) = stddev.
    # The previous log(square(stddev)) form treated `stddev` as a standard
    # deviation and diverged whenever it got close to zero.
    temp2 = 1 + stddev - tf.square(mu) - tf.exp(stddev)
    temp3 = temp2 * confidences     # confidences shape is [32, 1]
    KL_term = - 0.5 * tf.reduce_sum(temp3, reduction_indices=1)
    KL_term_total = tf.reduce_sum(KL_term)

with tf.name_scope('total_loss'):
    # Mean over the batch of (reconstruction loss + KL divergence).
    variational_lower_bound = tf.reduce_mean(log_likelihood + KL_term)
    tf.summary.scalar('loss', variational_lower_bound)
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(0.00001).minimize(variational_lower_bound)

# Local variables must be initialised too: the filename queue keeps its epoch
# counter in one.
init_op = tf.group(tf.local_variables_initializer(),
                   tf.global_variables_initializer())

saver = tf.train.Saver()

model_path = 'C:/Users/user/PycharmProjects/VariationalAutoEncoder/' \
             'VariationalAutoEncoderFaces/tensorboard_logs/Graph_model/ckpt'

# --- Training session. ---
with tf.Session() as sess:

    train_writer = tf.summary.FileWriter('C:/Users/user/PycharmProjects/VariationalAutoEncoder/'
                                         'VariationalAutoEncoderFaces/tensorboard_logs/Event_files', sess.graph)

    merged = tf.summary.merge_all()

    # init_op has to run before the Coordinator and the queue threads are
    # started, otherwise an error is thrown.
    sess.run(init_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    step = 0

    # The moving-average update ops and the optimizer are fetched only for
    # their side effects; the last four fetches are the values used below.
    # NOTE(review): `image` is the single-example tensor read straight from
    # the reader, not an element of the batch that was reconstructed, and
    # fetching it presumably consumes an extra record per step -- verify.
    to_run_list = [uma_conv1, uma_conv2, uma_conv3, uma_conv4, uma_deconv1, uma_deconv2, uma_deconv3,
                   uma_deconv4, uma_deconv, optimizer, variational_lower_bound, merged,
                   deconv_image_batch, image]

    # Checkpoints live in the directory that contains `model_path`.
    ckpt = tf.train.get_checkpoint_state(
        os.path.dirname(model_path))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoints are restored!!!')
    else:
        print('No stored checkpoints')
    epoch = 0
    try:
        try:
            while not coord.should_stop():

                fetched = sess.run(to_run_list)
                loss, summary, reconstructed_image, original_image = fetched[-4:]

                print('total loss:', loss)

                original_image = cv2.cvtColor(np.array(original_image), cv2.COLOR_RGB2BGR)
                reconstructed_image = cv2.cvtColor(np.array(reconstructed_image[0]), cv2.COLOR_RGB2BGR)

                cv2.imshow('original_image', original_image)
                cv2.imshow('reconstructed_image', reconstructed_image)
                cv2.waitKey(1)
                # 234 steps are counted as one epoch.
                if step % 234 == 0:
                    epoch += 1
                    print('epoch:', epoch)
                    if epoch == num_epoch - 2:
                        coord.request_stop()

                if step % 100 == 0:
                    train_writer.add_summary(summary, step)
                step += 1
        except tf.errors.OutOfRangeError:
            # Raised once the filename queue has produced all of its epochs.
            print('input queue exhausted')

        save_path = saver.save(sess, model_path)
    finally:
        # Always stop the queue threads and flush the event file, even when
        # the loop above ends with an error.
        coord.request_stop()
        coord.join(threads)
        train_writer.close()

非常感谢任何帮助!!

1 个答案:

答案 0 :(得分:0)

以下示例代码展示了经过3个SELU层后均值和方差的变化趋势。各层(包括输入层)的节点数为[15, 30, 30, 8]。

// NOTE(review): this C# fragment does not appear to match the surrounding
// text, which describes code tracking mean/variance across SELU layers --
// verify against the original answer.
// Builds the ids "panel0" .. "panel24" and sets `Visible` on each.
// NOTE(review): `panelID` is declared as a string here, so the `.Visible`
// assignment presumably needs the control looked up by that id first -- confirm.
for (int i = 0; i < 25; i++)
{
    string panelID = "panel" + i.ToString();
    panelID.Visible = true;
}

这是一种可能的输出。经过3层之后,均值和标准差仍然分别接近0和1。

// NOTE(review): this line does not look like the program output that the
// preceding sentence announces -- verify against the original answer.
// Synchronously reads the file at `cliJsonPath` as UTF-8 text; the result is
// not stored in this fragment.
fs.readFileSync(cliJsonPath, 'utf8')