在CIFAR-10上运行3层convnet的第一个训练时期时,我既无法达到足够高的验证准确性,也无法最小化目标函数。
具体来说,准确性在第一次迭代中会发生变化,然后在随后的迭代中稳定在8.7%。奇怪的是,我还训练了一个2层,全连接的网络,该网络的性能要好得多,在验证集上始终可以获得约43%的准确性。
注意:大部分代码来自Jupyter笔记本,该笔记本旨在作为斯坦福CS231n Convolutional Neural Networks for Visual Recognition作业的一部分提供的准系统Tensorflow(和Keras)的介绍,尽管我是<他们既不是本课程的学生,也不是大学的学生,我这样做纯粹是出于体验目的,并且出于我对简历/深度学习的初衷。 我的贡献只是实现正向传递和初始化网络参数。
笔记本的作者发表了一条评论,指出正确实施此模型后,该模型应在第一个时期后达到40%以上的精度,而无需进行任何超参数调整。
实施说明
49,000 / 1000:训练/验证拆分,批大小= 64
使用Kaiming归一化初始化重量,使用0s初始化偏差
学习率= 3e-3
下面详细介绍了convnet的每一层:
带有16个3x3滤波器,零填充1的ReLU卷积层(带偏置)
ReLU全连接层(带有偏差)可计算10个类别的分数
代码
(我的写在“ TODO”注释块之间)
import tensorflow as tf
import numpy as np
def load_cifar10(num_training=49000, num_validation=1000, num_test=10000):
cifar10 = tf.keras.datasets.cifar10.load_data()
(X_train, y_train), (X_test, y_test) = cifar10
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.int32).flatten()
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.int32).flatten()
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
mean_pixel = X_train.mean(axis=(0, 1, 2), keepdims=True)
std_pixel = X_train.std(axis=(0, 1, 2), keepdims=True)
X_train = (X_train - mean_pixel) / std_pixel
X_val = (X_val - mean_pixel) / std_pixel
X_test = (X_test - mean_pixel) / std_pixel
return X_train, y_train, X_val, y_val, X_test, y_test
class Dataset(object):
def __init__(self, X, y, batch_size, shuffle=False):
assert X.shape[0] == y.shape[0], 'Got different numbers of data and labels'
self.X, self.y = X, y
self.batch_size, self.shuffle = batch_size, shuffle
def __iter__(self):
N, B = self.X.shape[0], self.batch_size
idxs = np.arange(N)
if self.shuffle:
np.random.shuffle(idxs)
return iter((self.X[i:i+B], self.y[i:i+B]) for i in range(0, N, B))
def flatten(x):
N = tf.shape(x)[0]
return tf.reshape(x, (N, -1))
def three_layer_convnet(x, params):
conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
scores = None
############################################################################
# TODO: Implement the forward pass for the three-layer ConvNet. #
############################################################################
h1_conv = tf.nn.conv2d(x,
conv_w1 + conv_b1,
strides=[1, 1, 1, 1],
padding='SAME'
)
h1 = tf.nn.relu(h1_conv)
h2_conv = tf.nn.conv2d(h1,
conv_w2 + conv_b2,
strides=[1, 1, 1, 1],
padding='SAME'
)
h2 = tf.nn.relu(h2_conv)
fc_params = flatten(fc_w + fc_b)
h2 = flatten(h2)
scores = tf.matmul(h2, fc_params)
############################################################################
# END OF YOUR CODE #
############################################################################
return scores
def training_step(scores, y, params, learning_rate):
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
loss = tf.reduce_mean(losses)
grad_params = tf.gradients(loss, params)
new_weights = []
for w, grad_w in zip(params, grad_params):
new_w = tf.assign_sub(w, learning_rate * grad_w)
new_weights.append(new_w)
with tf.control_dependencies(new_weights):
return tf.identity(loss)
def check_accuracy(sess, dset, x, scores, is_training=None):
num_correct, num_samples = 0, 0
for x_batch, y_batch in dset:
feed_dict = {x: x_batch, is_training: 0}
scores_np = sess.run(scores, feed_dict=feed_dict)
y_pred = scores_np.argmax(axis=1)
num_samples += x_batch.shape[0]
num_correct += (y_pred == y_batch).sum()
acc = float(num_correct) / num_samples
print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))
def kaiming_normal(shape):
if len(shape) == 2:
fan_in, fan_out = shape[0], shape[1]
elif len(shape) == 4:
fan_in, fan_out = np.prod(shape[:3]), shape[3]
return tf.random_normal(shape) * np.sqrt(2.0 / fan_in)
def three_layer_convnet_init():
params = None
############################################################################
# TODO: Initialize the parameters of the three-layer network. #
############################################################################
conv_w1 = tf.Variable(kaiming_normal((5, 5, 3, 32)))
conv_b1 = tf.Variable(tf.zeros((32,)))
conv_w2 = tf.Variable(kaiming_normal((3, 3, 32, 16)))
conv_b2 = tf.Variable(tf.zeros((16,)))
fc_w = tf.Variable(kaiming_normal((32 * 32 * 16, 10)))
fc_b = tf.Variable(tf.zeros((10,)))
params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
############################################################################
# END OF YOUR CODE #
############################################################################
return params
def main():
learning_rate = 3e-3
tf.reset_default_graph()
is_training = tf.placeholder(tf.bool, name='is_training')
X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()
train_dset = Dataset(X_train, y_train, batch_size=64, shuffle=True)
test_dset = Dataset(X_test, y_test, batch_size=64)
val_dset = Dataset(X_val, y_val, batch_size=64, shuffle=False)
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape, y_train.dtype)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
device = '/cpu:0'
with tf.device(device):
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int32, [None])
params = three_layer_convnet_init()
scores = three_layer_convnet(x, params)
loss = training_step(scores, y, params, learning_rate)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for t, (x_np, y_np) in enumerate(train_dset):
feed_dict = {x: x_np, y: y_np}
loss_np = sess.run(loss, feed_dict=feed_dict)
if t % 100 == 0:
print('Iteration %d, loss = %.4f' % (t, loss_np))
check_accuracy(sess, val_dset, x, scores, is_training)
if __name__=="__main__":
main()
编辑:删除了不必要的注释和代码
答案 0 :(得分:0)
问题在这里
h1_conv = tf.nn.conv2d(x,
conv_w1 + conv_b1,
strides=[1, 1, 1, 1],
padding='SAME'
)
这是错误的,因为在这里您要将偏置值(conv_b1)添加到滤波器conv_w1,但必须将偏置添加到conv层的输出中。正确的方法应该是这样
h1_conv = tf.nn.conv2d(x,
conv_w1,
strides=[1, 1, 1, 1],
padding='SAME'
)
h1_bias = tf.nn.bias_add(h1_conv, conv_b1)
h1 = tf.nn.relu(h1_bias)
也将其校正为h2。