卷积神经网络训练问题:为什么训练中更新(进化)的是偏置(bias)数组而不是权重矩阵?

时间:2017-11-06 16:47:05

标签: tensorflow conv-neural-network

我想测试一个具有2个卷积层和2个全连接层的卷积神经网络实现。简单的神经网络模型对我来说运行正常,但当我加入卷积层后就遇到了问题。最初我想调整不同的超参数来优化模型性能。为了弄清楚为什么训练不起作用(验证准确率一直停留在0.1左右),我还通过TensorBoard添加了可视化。

当我只用一组超参数运行以下代码时,模型并没有真正在训练,因为准确率始终没有提高。不过,我能在TensorBoard中看到所有变量都已初始化,偏置(biases)在不断更新,但各层的权重矩阵却没有更新。

这就是我对TensorBoard的看法:

result of the training

我真的不明白为什么模型会努力更新权重。我知道它有时可以来自初始化,但我认为我使用了正确的选项,对吧?

如果你知道这个bug会在哪里,我真的很感兴趣!

PS:代码不是最优雅的,但是当我看到它没有工作时,我希望它尽可能简单

from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

LOGDIR = 'tensorboard_claire/tuning2'

patch_size = 5
kernel_size = 2
depth = 16
num_hidden = 64

def generate_hyperparameters():
    """Draw one random hyperparameter configuration for a tuning run.

    Returns a dict with: learning_rate (log-uniform in [1e-3, 1e-1]),
    batch_size (int in [1, 99]), dropout keep-probability in [0, 1),
    and stddev for weight init (log-uniform in [1e-4, 1e2]).
    """
    lr = 10 ** np.random.uniform(-3, -1)
    batch = np.random.randint(1, 100)
    keep_prob = np.random.uniform(0, 1)
    init_std = 10 ** np.random.uniform(-4, 2)
    return {
        "learning_rate": lr,
        "batch_size": batch,
        "dropout": keep_prob,
        "stddev": init_std,
    }

pickle_file = 'notMNIST.pickle'

# Load the pre-pickled notMNIST train/validation/test splits.
# BUG FIX: the statements belonging to this `with` block were not indented
# in the original, which is an IndentationError in Python.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

image_size = 28
num_labels = 10
num_channels = 1 # grayscale

def reformat(dataset, labels):
  """Reshape images to 4-D (N, H, W, C) float32 and one-hot encode labels."""
  images = dataset.reshape(
      (-1, image_size, image_size, num_channels)).astype(np.float32)
  # Comparing each label against 0..num_labels-1 yields a one-hot row per sample.
  one_hot = (labels[:, None] == np.arange(num_labels)).astype(np.float32)
  return images, one_hot

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

def conv_layer(data, weights, biases):
  """One conv block: stride-2 convolution, ReLU, then 2x2 max pooling.

  Both the convolution and the pooling use SAME padding, so each block
  shrinks the spatial dimensions by a factor of 4 overall.
  """
  convolved = tf.nn.conv2d(data, weights, [1, 2, 2, 1], padding='SAME')
  activated = tf.nn.relu(convolved + biases)
  return tf.nn.max_pool(activated, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

def reshape_drop(data):
  """Flatten a 4-D activation tensor to 2-D (batch, features).

  Relies on the tensor having a fully known static shape.
  """
  dims = data.get_shape().as_list()
  flat_size = dims[1] * dims[2] * dims[3]
  return tf.reshape(data, [dims[0], flat_size])

def train_cnn_and_compute_accuracy(hyperparameters, name='train'):
  """Build, train, and evaluate a 2-conv / 2-FC network on notMNIST.

  Args:
    hyperparameters: dict with "learning_rate", "batch_size",
      "dropout" (keep probability for tf.nn.dropout) and "stddev"
      (weight-initialization scale).
    name: unused tag, kept for interface compatibility.

  Returns:
    The last validation accuracy measured during training (float).

  Fixes vs the original:
    - normalized the inconsistent 3/4-space indentation (SyntaxError);
    - removed the ReLU on the final FC layer — logits fed to softmax
      cross-entropy must stay linear, otherwise negative logits are
      zeroed and training stalls;
    - dropout is now applied on the training path only; the validation
      path no longer drops units;
    - the summary writer is closed before returning (the original had
      unreachable close() calls after `return`).
  """
  batch_size = hyperparameters["batch_size"]
  std = hyperparameters["stddev"]

  graph = tf.Graph()
  with graph.as_default():
    # Input data.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)

    # Variables.
    weights = {
        'conv1': tf.Variable(tf.truncated_normal(
            [patch_size, patch_size, num_channels, depth], stddev=std), name='convw1'),
        'conv2': tf.Variable(tf.random_normal(
            [patch_size, patch_size, depth, depth], stddev=std), name='convw2'),
        'fc1': tf.Variable(tf.random_normal(
            [2 * 2 * depth, num_hidden], stddev=std), name='fcw1'),
        'fc2': tf.Variable(tf.random_normal(
            [num_hidden, num_labels], stddev=std), name='fcw2')
        }

    biases = {
        'conv1': tf.Variable(tf.zeros([depth]), name='convb1'),
        'conv2': tf.Variable(tf.constant(1.0, shape=[depth]), name='convb2'),
        'fc1': tf.Variable(tf.constant(1.0, shape=[num_hidden]), name='fcb1'),
        'fc2': tf.Variable(tf.constant(1.0, shape=[num_labels]), name='fcb2')
        }

    # Neural network model with 2 convolutional layers and 2 fully connected
    # layers, with max pooling and (train-only) dropout.
    with tf.name_scope("1st_conv_layer"):
        conv_1_train = conv_layer(tf_train_dataset, weights['conv1'], biases['conv1'])
        conv_1_valid = conv_layer(tf_valid_dataset, weights['conv1'], biases['conv1'])
        tf.summary.histogram("convw1", weights['conv1'])
        tf.summary.histogram("convb1", biases['conv1'])

    with tf.name_scope("2nd_conv_layer"):
        conv_2_train = conv_layer(conv_1_train, weights['conv2'], biases['conv2'])
        conv_2_valid = conv_layer(conv_1_valid, weights['conv2'], biases['conv2'])
        tf.summary.histogram("convw2", weights['conv2'])
        tf.summary.histogram("convb2", biases['conv2'])

    with tf.name_scope('dropout'):
        # Dropout is a training-time regularizer only: the validation
        # path must see the full network.
        dropped_train = tf.nn.dropout(conv_2_train, hyperparameters["dropout"])
        reshape_train = reshape_drop(dropped_train)
        reshape_valid = reshape_drop(conv_2_valid)

    with tf.name_scope("1st_fc_layer"):
        fc1_train = tf.nn.relu(tf.matmul(reshape_train, weights['fc1']) + biases['fc1'])
        fc1_valid = tf.nn.relu(tf.matmul(reshape_valid, weights['fc1']) + biases['fc1'])
        tf.summary.histogram("fcw1", weights['fc1'])
        tf.summary.histogram("fcb1", biases['fc1'])

    with tf.name_scope("2nd_fc_layer"):
        # No activation here: these are the logits consumed by softmax
        # cross-entropy below.
        fc2_train = tf.matmul(fc1_train, weights['fc2']) + biases['fc2']
        fc2_valid = tf.matmul(fc1_valid, weights['fc2']) + biases['fc2']
        tf.summary.histogram("fcw2", weights['fc2'])
        tf.summary.histogram("fcb2", biases['fc2'])

    # Predictions.
    logits = fc2_train
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(fc2_valid)

    # Cross-entropy loss.
    with tf.name_scope('xentropy'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits))
        tf.summary.scalar("xent", loss)

    # Decaying learning rate and Adam optimizer.
    with tf.name_scope('train'):
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(
            hyperparameters["learning_rate"], global_step, 100, 0.96, staircase=True)
        tf.summary.scalar("learning_rate", learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=global_step)

    with tf.name_scope("valid_accuracy"):
        correct_prediction = tf.equal(
            tf.argmax(valid_prediction, 1), tf.argmax(valid_labels, 1))
        # Cast booleans to floats so the mean is the accuracy.
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar("valid_accuracy", accuracy)

    num_steps = 1001
    val_acc = 0

    with tf.Session(graph=graph) as session:
        summ = tf.summary.merge_all()
        tf.global_variables_initializer().run()
        writer = tf.summary.FileWriter(LOGDIR + "/" + make_hparam_string(hyperparameters))
        writer.add_graph(session.graph)

        for step in range(num_steps):
            # Sliding mini-batch window over the training set.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
            _, l, predictions, summary = session.run(
                [optimizer, loss, train_prediction, summ], feed_dict=feed_dict)

            if step % 70 == 0:
                print("Current step: " + str(step))
                val_acc = accuracy.eval()
                print("Validation accuracy : " + str(val_acc))

            if step % 5 == 0:
                writer.add_summary(summary, step)

        # Flush summaries before leaving the session context (the `with`
        # statement closes the session itself).
        writer.close()

    return val_acc

def make_hparam_string(h):
    """Encode a hyperparameter dict as a TensorBoard run-directory name."""
    parts = (
        "lr_" + str(h["learning_rate"]),
        "dp_" + str(h["dropout"]),
        "batch_size_" + str(h["batch_size"]),
        "stddev_" + str(h["stddev"]),
    )
    return ",".join(parts)

# Sample a handful of random hyperparameter configurations.
hyperparameter_configurations = [generate_hyperparameters() for _ in range(5)]

# Train one model per configuration and collect the validation accuracies.
results = []
for hyperparameters in hyperparameter_configurations:
    print("Hyperparameters : ", hyperparameters.values())
    results.append(train_cnn_and_compute_accuracy(hyperparameters))

1 个答案:

答案 0 :(得分:3)

代码有点混乱,但无论如何:100 的标准差(stddev)太大了,应该在 0.1 左右。其次,在 softmax 之前的最后一层不应该使用 ReLU(或任何其他激活函数)。另外 dropout 的取值范围也太宽了;如果你想保留这些超参数,至少先去掉 dropout,确认网络在没有它的情况下能正常训练(如果随机抽到的保留概率接近 0.1,权重就很难得到更新),之后再把它加回来。
首先尝试解决这个问题,如果它没有帮助,我们可以仔细观察。

相关问题