To avoid overfitting, I tried using dropout in the fully connected layer of a CNN trained on the CIFAR-10 dataset. I got a strange result: the training loss decreases fairly quickly, but the test accuracy does not improve at all. What is wrong? Any help is much appreciated! See the printout below:
Generation # 5. Train loss: 543.70. Train acc (test acc): 14.00 (11.50)
Generation # 10. Train loss: 390.62. Train acc (test acc): 7.50 (11.50)
Generation # 15. Train loss: 286.08. Train acc (test acc): 13.50 (10.50)
Generation # 20. Train loss: 211.68. Train acc (test acc): 12.00 (11.00)
Generation # 25. Train loss: 180.75. Train acc (test acc): 7.50 (11.00)
Generation # 30. Train loss: 140.63. Train acc (test acc): 14.50 (17.00)
Generation # 35. Train loss: 123.40. Train acc (test acc): 17.00 (15.50)
Generation # 40. Train loss: 107.11. Train acc (test acc): 13.00 (11.50)
Generation # 45. Train loss: 96.01. Train acc (test acc): 16.50 (12.50)
Generation # 50. Train loss: 68.94. Train acc (test acc): 18.50 (15.00)
Generation # 55. Train loss: 65.62. Train acc (test acc): 12.00 (17.00)
Generation # 60. Train loss: 47.64. Train acc (test acc): 19.00 (18.00)
Generation # 65. Train loss: 33.38. Train acc (test acc): 21.00 (15.50)
Generation # 70. Train loss: 29.28. Train acc (test acc): 17.00 (14.00)
Generation # 75. Train loss: 22.45. Train acc (test acc): 13.00 (18.00)
Generation # 80. Train loss: 17.00. Train acc (test acc): 11.50 (14.00)
Generation # 85. Train loss: 10.91. Train acc (test acc): 10.50 (10.50)
Generation # 90. Train loss: 8.18. Train acc (test acc): 12.00 (9.50)
Generation # 95. Train loss: 7.07. Train acc (test acc): 10.50 (10.00)
Generation # 100. Train loss: 5.05. Train acc (test acc): 14.00 (15.50)
Generation # 105. Train loss: 3.97. Train acc (test acc): 14.00 (16.00)
Generation # 110. Train loss: 3.90. Train acc (test acc): 10.50 (4.50)
Generation # 115. Train loss: 3.83. Train acc (test acc): 11.50 (11.00)
Generation # 120. Train loss: 4.25. Train acc (test acc): 8.50 (10.50)
Generation # 125. Train loss: 3.28. Train acc (test acc): 6.50 (12.50)
Generation # 130. Train loss: 3.59. Train acc (test acc): 13.00 (8.00)
The complete code of the CNN is as follows:
import tensorflow as tf
import numpy as np

# train_x, train_labels, test_x, test_labels hold the CIFAR-10 images and
# integer class labels (the loading code is not shown here)
batch_size = 200
learning_rate = 0.0001
evaluation_size = 200
image_width = train_x[0].shape[0]
image_height = train_x[0].shape[1]
target_size = max(train_labels) + 1
num_channels = 3
generations = 20000
eval_every = 5
conv1_features = 32
conv2_features = 32
conv3_features = 64
max_pool_size1 = 2
max_pool_size2 = 2
max_pool_size3 = 2
fully_connected_size1 = 100
dropout_rate = 0.5
keep_prob = tf.placeholder(tf.float32)
x_input_shape = (batch_size, image_width, image_height, num_channels)
x_input = tf.placeholder(tf.float32, shape=x_input_shape)
y_target = tf.placeholder(tf.int32, shape=(batch_size))
eval_input_shape = (evaluation_size, image_width, image_height, num_channels)
eval_input = tf.placeholder(tf.float32, shape=eval_input_shape)
eval_target = tf.placeholder(tf.int32, shape=(evaluation_size))
conv1_weight = tf.Variable(tf.truncated_normal([5,5,num_channels,conv1_features], stddev=0.1, dtype=tf.float32))
conv1_bias = tf.Variable(tf.zeros([conv1_features], dtype=tf.float32))
conv2_weight = tf.Variable(tf.truncated_normal([5,5,conv1_features,conv2_features], stddev=0.1, dtype=tf.float32))
conv2_bias = tf.Variable(tf.zeros([conv2_features], dtype=tf.float32))
conv3_weight = tf.Variable(tf.truncated_normal([5,5,conv2_features,conv3_features], stddev=0.1, dtype=tf.float32))
conv3_bias = tf.Variable(tf.zeros([conv3_features], dtype=tf.float32))
resulting_width = image_width // (max_pool_size1 * max_pool_size2 * max_pool_size3)
resulting_height = image_height // (max_pool_size1 * max_pool_size2 * max_pool_size3)
full1_input_size = resulting_width * resulting_height * conv3_features
full1_weight = tf.Variable(tf.truncated_normal([full1_input_size,fully_connected_size1], stddev=0.1, dtype=tf.float32))
full1_bias = tf.Variable(tf.truncated_normal([fully_connected_size1], stddev=0.1, dtype=tf.float32))
full2_weight = tf.Variable(tf.truncated_normal([fully_connected_size1, target_size], stddev=0.1, dtype=tf.float32))
full2_bias = tf.Variable(tf.truncated_normal([target_size], stddev=0.1, dtype=tf.float32))
# define net
def my_conv_net(input_data):
    # 1st conv-relu-maxpool layer
    conv1 = tf.nn.conv2d(input_data, conv1_weight, strides=[1,1,1,1], padding='SAME')
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
    max_pool1 = tf.nn.max_pool(relu1, ksize=[1,max_pool_size1,max_pool_size1,1],
                               strides=[1, max_pool_size1, max_pool_size1, 1], padding='SAME')
    # 2nd conv-relu-maxpool layer
    conv2 = tf.nn.conv2d(max_pool1, conv2_weight, strides=[1,1,1,1], padding='SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
    max_pool2 = tf.nn.max_pool(relu2, ksize=[1,max_pool_size2,max_pool_size2,1],
                               strides=[1, max_pool_size2, max_pool_size2, 1], padding='SAME')
    # 3rd conv-relu-maxpool layer
    conv3 = tf.nn.conv2d(max_pool2, conv3_weight, strides=[1,1,1,1], padding='SAME')
    relu3 = tf.nn.relu(tf.nn.bias_add(conv3, conv3_bias))
    max_pool3 = tf.nn.max_pool(relu3, ksize=[1,max_pool_size3,max_pool_size3,1],
                               strides=[1, max_pool_size3, max_pool_size3, 1], padding='SAME')
    # Flatten the output into [batch_size, N] for the fully connected layers
    final_conv_shape = max_pool3.get_shape().as_list()  # [batch_size, height, width, channels]
    final_shape = final_conv_shape[1] * final_conv_shape[2] * final_conv_shape[3]
    flat_output = tf.reshape(max_pool3, [final_conv_shape[0], final_shape])
    # 1st fully connected layer
    fully_connected1 = tf.nn.relu(tf.add(tf.matmul(flat_output, full1_weight), full1_bias))
    fully_connected1_dropout = tf.nn.dropout(fully_connected1, keep_prob)
    # 2nd fully connected layer
    final_model_output = tf.add(tf.matmul(fully_connected1_dropout, full2_weight), full2_bias)
    return final_model_output
# model output
model_output = my_conv_net(x_input)
test_model_output = my_conv_net(eval_input)
# sparse softmax cross-entropy: labels are integer class ids, not one-hot
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model_output, labels=y_target))
prediction = tf.nn.softmax(model_output)
test_prediction = tf.nn.softmax(test_model_output)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# create accuracy function
def get_accuracy(logits, targets):
    batch_predictions = np.argmax(logits, axis=1)
    num_correct = np.sum(np.equal(batch_predictions, targets))
    return 100. * num_correct / batch_predictions.shape[0]
# initialize variables
init = tf.global_variables_initializer()
sess = tf.Session()  # create a session to run the graph
sess.run(init)
train_loss = []
train_acc = []
test_acc = []
for i in range(generations):
    rand_index = np.random.choice(len(train_x), size=batch_size, replace=False)
    rand_x = train_x[rand_index]
    rand_y = train_labels[rand_index]
    # dropout is active during the training step (keep_prob = dropout_rate)
    train_dict = {x_input: rand_x, y_target: rand_y, keep_prob: dropout_rate}
    sess.run(train_step, feed_dict=train_dict)
    # loss and accuracy are measured with dropout disabled (keep_prob = 1)
    temp_train_loss, temp_train_preds = sess.run([loss, prediction], feed_dict={x_input: rand_x, y_target: rand_y, keep_prob: 1})
    temp_train_acc = get_accuracy(temp_train_preds, rand_y)
    if (i+1) % eval_every == 0:
        eval_index = np.random.choice(len(test_x), size=evaluation_size)
        eval_x = test_x[eval_index]
        eval_y = test_labels[eval_index]
        test_dict = {eval_input: eval_x, eval_target: eval_y, keep_prob: 1}
        test_preds = sess.run(test_prediction, feed_dict=test_dict)
        temp_test_acc = get_accuracy(test_preds, eval_y)
        # record and print results
        train_loss.append(temp_train_loss)
        train_acc.append(temp_train_acc)
        test_acc.append(temp_test_acc)
        acc_and_loss = [(i+1), temp_train_loss, temp_train_acc, temp_test_acc]
        acc_and_loss = [np.round(x,2) for x in acc_and_loss]
        print('Generation # {}. Train loss: {:.2f}. Train acc (test acc): {:.2f} ({:.2f})'.format(*acc_and_loss))
Answer (score: 1)
Your model is definitely not learning, because even the training accuracy does not improve. I cannot spot an obvious bug in your code, so it looks like it is time to tune the hyperparameters. My suggestions (a sketch applying them follows the list):

- Learning rate: try 0.01 or 0.001. Most likely 0.0001 is too small (it does not look too large, but who knows).
- Try a different stddev for the weight initialization: 0.001 or 0.0001. I have seen cases where this parameter alone was the reason a model did not learn.
- keep_prob (dropout) is good for fighting overfitting, but in this state your model does not even overfit. You can set it to 1 until you have at least something working.
- 3x3 convolutions are better, especially when you do not have that many filters (32 and 64).
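A minimal sketch of those tweaks, reusing the variable names from the question; the concrete values are only assumed starting points to experiment with, not known-good settings:

import tensorflow as tf

# values to try (assumptions, not verified settings)
learning_rate = 0.001   # larger than the original 0.0001
init_stddev = 0.01      # also try 0.001 / 0.0001
dropout_rate = 1.0      # feed keep_prob = 1.0 until the model learns at all

num_channels = 3
conv1_features = 32

# 3x3 kernels instead of 5x5, with the smaller initialization stddev
conv1_weight = tf.Variable(tf.truncated_normal(
    [3, 3, num_channels, conv1_features], stddev=init_stddev, dtype=tf.float32))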
If none of this helps, I would suggest visualizing the distribution of the activations in each layer, the gradients, or the weight distributions to narrow down the problem.
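One way to do that is with TensorBoard histogram summaries. This is only a sketch against the TF1 summary API; conv1_weight, loss, learning_rate, sess, train_dict and i refer to the names defined in the question's code, and the './logs' directory and summary names are arbitrary:

# weight distribution of the first conv layer
tf.summary.histogram('conv1_weight', conv1_weight)
# activation histograms can be added the same way inside my_conv_net,
# e.g. tf.summary.histogram('fc1_activations', fully_connected1)

# log gradient histograms by splitting minimize() into compute/apply
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
for grad, var in grads_and_vars:
    if grad is not None:
        tf.summary.histogram(var.op.name + '/gradient', grad)
train_step = optimizer.apply_gradients(grads_and_vars)

merged_summaries = tf.summary.merge_all()
writer = tf.summary.FileWriter('./logs', sess.graph)

# inside the training loop:
# summary = sess.run(merged_summaries, feed_dict=train_dict)
# writer.add_summary(summary, i)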