Dropout gives strange results for a convolutional NN on the CIFAR-10 dataset

Date: 2017-04-19 19:14:03

Tags: python tensorflow neural-network deep-learning conv-neural-network

To avoid overfitting, I tried applying dropout to the fully connected layer of a CNN trained on the CIFAR-10 dataset, and I got a strange result: the training loss drops significantly and fairly quickly, but the test accuracy does not improve at all. What is going wrong? Any help is much appreciated. See the printout below:

Generation # 5. Train loss: 543.70. Train acc (test acc): 14.00 (11.50)
Generation # 10. Train loss: 390.62. Train acc (test acc): 7.50 (11.50)
Generation # 15. Train loss: 286.08. Train acc (test acc): 13.50 (10.50)
Generation # 20. Train loss: 211.68. Train acc (test acc): 12.00 (11.00)
Generation # 25. Train loss: 180.75. Train acc (test acc): 7.50 (11.00)
Generation # 30. Train loss: 140.63. Train acc (test acc): 14.50 (17.00)
Generation # 35. Train loss: 123.40. Train acc (test acc): 17.00 (15.50)
Generation # 40. Train loss: 107.11. Train acc (test acc): 13.00 (11.50)
Generation # 45. Train loss: 96.01. Train acc (test acc): 16.50 (12.50)
Generation # 50. Train loss: 68.94. Train acc (test acc): 18.50 (15.00)
Generation # 55. Train loss: 65.62. Train acc (test acc): 12.00 (17.00)
Generation # 60. Train loss: 47.64. Train acc (test acc): 19.00 (18.00)
Generation # 65. Train loss: 33.38. Train acc (test acc): 21.00 (15.50)
Generation # 70. Train loss: 29.28. Train acc (test acc): 17.00 (14.00)
Generation # 75. Train loss: 22.45. Train acc (test acc): 13.00 (18.00)
Generation # 80. Train loss: 17.00. Train acc (test acc): 11.50 (14.00)
Generation # 85. Train loss: 10.91. Train acc (test acc): 10.50 (10.50)
Generation # 90. Train loss: 8.18. Train acc (test acc): 12.00 (9.50)
Generation # 95. Train loss: 7.07. Train acc (test acc): 10.50 (10.00)
Generation # 100. Train loss: 5.05. Train acc (test acc): 14.00 (15.50)
Generation # 105. Train loss: 3.97. Train acc (test acc): 14.00 (16.00)
Generation # 110. Train loss: 3.90. Train acc (test acc): 10.50 (4.50)
Generation # 115. Train loss: 3.83. Train acc (test acc): 11.50 (11.00)
Generation # 120. Train loss: 4.25. Train acc (test acc): 8.50 (10.50)
Generation # 125. Train loss: 3.28. Train acc (test acc): 6.50 (12.50)
Generation # 130. Train loss: 3.59. Train acc (test acc): 13.00 (8.00)

The complete code of the CNN is as follows:

import numpy as np
import tensorflow as tf

# train_x, train_labels, test_x, test_labels are assumed to already hold the
# CIFAR-10 images and integer labels as numpy arrays
batch_size = 200
learning_rate = 0.0001
evaluation_size = 200
image_width = train_x[0].shape[0]
image_height = train_x[0].shape[1]
target_size = max(train_labels) + 1
num_channels = 3
generations = 20000
eval_every = 5
conv1_features = 32
conv2_features = 32
conv3_features = 64
max_pool_size1 = 2
max_pool_size2 = 2
max_pool_size3 = 2
fully_connected_size1 = 100
dropout_rate = 0.5

keep_prob = tf.placeholder(tf.float32)
x_input_shape = (batch_size, image_width, image_height, num_channels)
x_input = tf.placeholder(tf.float32, shape=x_input_shape)
y_target = tf.placeholder(tf.int32, shape=(batch_size))
eval_input_shape = (evaluation_size, image_width, image_height, num_channels)
eval_input = tf.placeholder(tf.float32, shape=eval_input_shape)
eval_target = tf.placeholder(tf.int32, shape=(evaluation_size))

conv1_weight = tf.Variable(tf.truncated_normal([5,5,num_channels,conv1_features], stddev=0.1, dtype=tf.float32))
conv1_bias = tf.Variable(tf.zeros([conv1_features], dtype=tf.float32))
conv2_weight = tf.Variable(tf.truncated_normal([5,5,conv1_features,conv2_features], stddev=0.1, dtype=tf.float32))
conv2_bias = tf.Variable(tf.zeros([conv2_features], dtype=tf.float32))
conv3_weight = tf.Variable(tf.truncated_normal([5,5,conv2_features,conv3_features], stddev=0.1, dtype=tf.float32))
conv3_bias = tf.Variable(tf.zeros([conv3_features], dtype=tf.float32))

resulting_width = image_width // (max_pool_size1 * max_pool_size2 * max_pool_size3)
resulting_height = image_height // (max_pool_size1 * max_pool_size2 * max_pool_size3)
full1_input_size = resulting_width * resulting_height * conv3_features
full1_weight = tf.Variable(tf.truncated_normal([full1_input_size,fully_connected_size1], stddev=0.1, dtype=tf.float32))
full1_bias = tf.Variable(tf.truncated_normal([fully_connected_size1], stddev=0.1, dtype=tf.float32))
full2_weight = tf.Variable(tf.truncated_normal([fully_connected_size1, target_size], stddev=0.1, dtype=tf.float32))
full2_bias = tf.Variable(tf.truncated_normal([target_size], stddev=0.1, dtype=tf.float32))

# define net
def my_conv_net(input_data):
    # 1st conv relu maxpool layer
    conv1 = tf.nn.conv2d(input_data, conv1_weight, strides=[1,1,1,1], padding='SAME')
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
    max_pool1 = tf.nn.max_pool(relu1, ksize=[1,max_pool_size1,max_pool_size1,1], 
            strides=[1, max_pool_size1, max_pool_size1, 1], padding='SAME')
    # 2nd conv relu maxpool
    conv2 = tf.nn.conv2d(max_pool1, conv2_weight, strides=[1,1,1,1], padding='SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
    max_pool2 = tf.nn.max_pool(relu2, ksize=[1,max_pool_size2,max_pool_size2,1], 
            strides=[1, max_pool_size2, max_pool_size2, 1], padding='SAME')
    # 3rd conv relu maxpool layer
    conv3 = tf.nn.conv2d(max_pool2, conv3_weight, strides=[1,1,1,1], padding='SAME')
    relu3 = tf.nn.relu(tf.nn.bias_add(conv3, conv3_bias))
    max_pool3 = tf.nn.max_pool(relu3, ksize=[1,max_pool_size3,max_pool_size3,1], 
            strides=[1, max_pool_size3, max_pool_size3, 1], padding='SAME')
    # Transform output into 1xN layer for next fully connected layer
    final_conv_shape = max_pool3.get_shape().as_list()      # [batch_size/num of image, height, width, channel]
    final_shape = final_conv_shape[1] * final_conv_shape[2] * final_conv_shape[3]
    flat_output = tf.reshape(max_pool3, [final_conv_shape[0],final_shape])
    # 1st fully connected layer
    fully_connected1 = tf.nn.relu(tf.add(tf.matmul(flat_output, full1_weight), full1_bias))
    fully_connected1_dropout = tf.nn.dropout(fully_connected1, keep_prob)
    # 2nd fully connected layer
    final_model_output = tf.add(tf.matmul(fully_connected1_dropout, full2_weight), full2_bias)

    return final_model_output

# model output
model_output = my_conv_net(x_input)
test_model_output = my_conv_net(eval_input)    

# sparse softmax cross-entropy loss, averaged over the batch; labels are integer class indices, not one-hot
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model_output, labels=y_target))

prediction = tf.nn.softmax(model_output)
test_prediction = tf.nn.softmax(test_model_output)

train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# create accuracy function
def get_accuracy(logits, targets):
    batch_predictions = np.argmax(logits, axis=1)
    num_correct = np.sum(np.equal(batch_predictions, targets))
    return 100. * num_correct / batch_predictions.shape[0]

# initialize variables
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

train_loss = []
train_acc = []
test_acc = []
for i in range(generations):
    rand_index = np.random.choice(len(train_x), size=batch_size, replace=False)
    rand_x = train_x[rand_index] 
    rand_y = train_labels[rand_index]
    train_dict = {x_input: rand_x, y_target: rand_y, keep_prob: dropout_rate}
    sess.run(train_step, feed_dict=train_dict)
    temp_train_loss, temp_train_preds = sess.run([loss, prediction], feed_dict={x_input: rand_x, y_target: rand_y, keep_prob: 1})
    temp_train_acc = get_accuracy(temp_train_preds, rand_y)
    if (i+1) % eval_every == 0:
        eval_index = np.random.choice(len(test_x), size=evaluation_size)
        eval_x = test_x[eval_index]
        eval_y = test_labels[eval_index]
        test_dict = {eval_input: eval_x, eval_target: eval_y, keep_prob: 1}
        test_preds = sess.run(test_prediction, feed_dict=test_dict)
        temp_test_acc = get_accuracy(test_preds, eval_y)
        # record and print results
        train_loss.append(temp_train_loss)
        train_acc.append(temp_train_acc)
        test_acc.append(temp_test_acc)
        acc_and_loss = [(i+1), temp_train_loss, temp_train_acc, temp_test_acc]
        acc_and_loss = [np.round(x,2) for x in acc_and_loss]
        print('Generation # {}. Train loss: {:.2f}. Train acc (test acc): {:.2f} ({:.2f})'.format(*acc_and_loss))        

1 Answer:

Answer 0 (score: 1):

Your model is definitely not learning, since even the training accuracy is not improving. I can't spot an obvious bug in your code, so it looks like it's time to tune the hyperparameters. My suggestions (a minimal sketch applying a few of them follows the list):

  • Try different learning rates: 0.01, 0.001. Most likely 0.0001 is simply too small (it doesn't look too large either, but who knows).
  • Try different initialization stddev values: 0.001, 0.0001. I have seen cases where this parameter alone was the only reason a network would not learn.
  • Biases can simply be initialized with zeros.
  • Start with a larger dropout keep probability. A small keep_prob is great for fighting overfitting, but in its current state your model is not even overfitting. You can set it to 1 until you have something that at least works.
  • I would also start with smaller filter sizes, e.g. 3x3 is better, especially since you don't have that many filters (32, 64).
  • I'm not sure the second FC layer helps. Consider removing it until you see the network learning.
  • By the way, even after the problem is fixed, it usually pays to tune hyperparameters to find the best combination. That will buy you some extra accuracy, although it is not the critical issue at this point.
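
A minimal sketch of what a few of these suggestions look like when applied to the posted code. The variable names follow the question; the concrete values (learning rate 0.001, 3x3 filters, stddev 0.001, keep_prob of 1.0) are only starting points to try, not tuned results:

# Sketch only: reuses the question's variable names; values are starting points to experiment with
learning_rate = 0.001        # try 0.01 / 0.001 instead of 0.0001
dropout_rate = 1.0           # feed keep_prob = 1.0 until the network learns at all

# smaller 3x3 filters, smaller init stddev, zero-initialized biases
conv1_weight = tf.Variable(tf.truncated_normal([3, 3, num_channels, conv1_features],
                                                stddev=0.001, dtype=tf.float32))
conv1_bias = tf.Variable(tf.zeros([conv1_features], dtype=tf.float32))
full1_bias = tf.Variable(tf.zeros([fully_connected_size1], dtype=tf.float32))
full2_bias = tf.Variable(tf.zeros([target_size], dtype=tf.float32))

conv2_weight and conv3_weight would change in the same way.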

If none of this helps, I suggest visualizing the distributions of the activations in each layer, of the gradients, and of the weights, to narrow the problem down.
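
A rough TF 1.x sketch of one way to do that with TensorBoard histogram summaries; the summary names and the './logs' directory are illustrative and not part of the question's code:

# Sketch: log weight and gradient histograms so their distributions can be inspected
# in TensorBoard. Activations would additionally require my_conv_net to return its
# intermediate tensors, so only weights and gradients are shown here.
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
train_step = optimizer.apply_gradients(grads_and_vars)

for grad, var in grads_and_vars:
    if grad is not None:
        tf.summary.histogram(var.op.name + '/weight', var)
        tf.summary.histogram(var.op.name + '/gradient', grad)

merged_summaries = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter('./logs', sess.graph)

# inside the training loop, e.g. in the eval_every branch:
#     summary_str = sess.run(merged_summaries, feed_dict=train_dict)
#     summary_writer.add_summary(summary_str, i)

Running tensorboard --logdir ./logs then shows how each distribution evolves over the training steps, which usually makes dead ReLUs, exploding gradients, or a bad initialization easy to spot.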