I tried to implement AlexNet myself using the basic TensorFlow Python API, but during training I get some unexpected output and the test accuracy is very low.
Training dataset: oxford17
Here is my code:
import tensorflow as tf
import tflearn.datasets.oxflower17 as oxflower17
from IPython import embed
# network training hyperparameters
learning_rate = 0.001
batch_size = 64
log_dir = './tflog/alexnet/'
# define input size
image_width = 224
image_height = 224
image_depth = 3
num_labels = 17
# initialize datasets
print('-----Dataset initialize start-----')
ox17_dataset, ox17_labels = oxflower17.load_data(one_hot=True)
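# oxflower17.load_data returns the full Oxford Flowers-17 set: 1360 images resized to
# 224x224x3, together with one-hot labels of shape (1360, 17)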
train_dataset = ox17_dataset[:1024, :, :, :]
train_labels = ox17_labels[:1024, :]
test_dataset = ox17_dataset[1024:, :, :, :]
test_labels = ox17_labels[1024:, :]
print('Training dataset size is {}'.format(train_dataset.shape))
print('Test dataset size is {}'.format(test_dataset.shape))
print('-----Dataset initialize complete-----')
# initialize all weights and variables
def get_alexnet_variables(output_class_num):
    with tf.name_scope('conv1'):
        w1 = tf.Variable(tf.truncated_normal(shape=[11, 11, 3, 96], stddev=0.1), name='w1')
        b1 = tf.Variable(tf.zeros([96]), name='b1')
    with tf.name_scope('conv2'):
        w2 = tf.Variable(tf.truncated_normal(shape=[5, 5, 96, 256], stddev=0.1), name='w2')
        b2 = tf.Variable(tf.constant(1.0, shape=[256]), name='b2')
    with tf.name_scope('conv3'):
        w3 = tf.Variable(tf.truncated_normal(shape=[3, 3, 256, 384], stddev=0.1), name='w3')
        b3 = tf.Variable(tf.zeros([384]), name='b3')
    with tf.name_scope('conv4'):
        w4 = tf.Variable(tf.truncated_normal(shape=[3, 3, 384, 384], stddev=0.1), name='w4')
        b4 = tf.Variable(tf.constant(1.0, shape=[384]), name='b4')
    with tf.name_scope('conv5'):
        w5 = tf.Variable(tf.truncated_normal(shape=[3, 3, 384, 256], stddev=0.1), name='w5')
        b5 = tf.Variable(tf.zeros([256]), name='b5')
    with tf.name_scope('fc6'):
        w6 = tf.Variable(tf.truncated_normal(shape=[(224 // 2 ** 5) * (224 // 2 ** 5) * 256, 4096], stddev=0.1),
                         name='w6')
        b6 = tf.Variable(tf.constant(1.0, shape=[4096]), name='b6')
    with tf.name_scope('fc7'):
        w7 = tf.Variable(tf.truncated_normal(shape=[4096, 4096], stddev=0.1), name='w7')
        b7 = tf.Variable(tf.constant(1.0, shape=[4096]), name='b7')
    with tf.name_scope('fc8'):
        w8 = tf.Variable(tf.truncated_normal(shape=[4096, output_class_num], stddev=0.1), name='w8')
        b8 = tf.Variable(tf.constant(1.0, shape=[output_class_num]), name='b8')
    variables = dict(w1=w1, w2=w2, w3=w3, w4=w4, w5=w5, w6=w6, w7=w7, w8=w8, b1=b1, b2=b2, b3=b3, b4=b4, b5=b5, b6=b6,
                     b7=b7, b8=b8)
    return variables
# build alexnet
def alexnet(variables, input_data, keep_prob_for_net):
    # layer one: convolutional layer
    with tf.name_scope('conv1'):
        conv1 = tf.nn.conv2d(input_data, variables['w1'], strides=[1, 4, 4, 1], padding='SAME')
        conv1 = tf.nn.bias_add(conv1, variables['b1'])
        conv1 = tf.nn.relu(conv1)
        conv1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
        conv1 = tf.nn.local_response_normalization(conv1)
    # layer two: convolutional layer
    with tf.name_scope('conv2'):
        conv2 = tf.nn.conv2d(conv1, variables['w2'], strides=[1, 1, 1, 1], padding='SAME')
        conv2 = tf.nn.bias_add(conv2, variables['b2'])
        conv2 = tf.nn.relu(conv2)
        conv2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
        conv2 = tf.nn.local_response_normalization(conv2)
    # layer three: convolutional layer
    with tf.name_scope('conv3'):
        conv3 = tf.nn.conv2d(conv2, variables['w3'], strides=[1, 1, 1, 1], padding='SAME')
        conv3 = tf.nn.bias_add(conv3, variables['b3'])
        conv3 = tf.nn.relu(conv3)
    # layer four: convolutional layer
    with tf.name_scope('conv4'):
        conv4 = tf.nn.conv2d(conv3, variables['w4'], strides=[1, 1, 1, 1], padding='SAME')
        conv4 = tf.nn.bias_add(conv4, variables['b4'])
        conv4 = tf.nn.relu(conv4)
    # layer five: convolutional layer
    with tf.name_scope('conv5'):
        conv5 = tf.nn.conv2d(conv4, variables['w5'], strides=[1, 1, 1, 1], padding='SAME')
        conv5 = tf.nn.bias_add(conv5, variables['b5'])
        conv5 = tf.nn.relu(conv5)
        conv5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
        conv5 = tf.nn.local_response_normalization(conv5)
    # flatten the feature maps before the fully connected layers
    with tf.name_scope('flatten'):
        flatten = tf.reshape(conv5, shape=[-1, (224 // 2 ** 5) * (224 // 2 ** 5) * 256])
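        # note: the spatial size here is 224 // 2**5 = 7 because the stride-4 conv1 and the
        # three stride-2 max pools (all SAME padding) shrink each dimension by 4 * 2 * 2 * 2 = 32,
        # so flatten has 7 * 7 * 256 = 12544 features per image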
    # layer six: fully connected layer
    with tf.name_scope('fc6'):
        fc6 = tf.matmul(flatten, variables['w6']) + variables['b6']
        fc6 = tf.nn.tanh(fc6)
        fc6 = tf.nn.dropout(fc6, keep_prob=keep_prob_for_net)
    # layer seven: fully connected layer
    with tf.name_scope('fc7'):
        fc7 = tf.matmul(fc6, variables['w7']) + variables['b7']
        fc7 = tf.nn.tanh(fc7)
        fc7 = tf.nn.dropout(fc7, keep_prob=keep_prob_for_net)
    # layer eight: fully connected layer
    with tf.name_scope('fc8'):
        logits_output = tf.matmul(fc7, variables['w8']) + variables['b8']
    return logits_output
# define placeholder, loss and accuracy
network_input = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
true_labels = tf.placeholder(tf.float32, shape=[None, 17])
keep_prob = tf.placeholder(tf.float32)
# network output
y_ = alexnet(get_alexnet_variables(17), network_input, keep_prob)
# cross entropy loss
cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=true_labels, logits=y_))
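# note: softmax_cross_entropy_with_logits_v2 applies the softmax itself, so y_ is passed in
# as raw logits (the fc8 output has no activation), which is what the function expects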
tf.summary.scalar('cross entropy', cross_entropy_loss)
# training step
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy_loss)
# calculate accuracy
correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(true_labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)
# run training process
with tf.Session() as sess:
    print('-----Training Start-----')
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(log_dir + '/test')
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    i = 0
    for j in range(1000):
        i += 1
        if i * batch_size > 1024:
            i = 1
        print('Train step {} running {}/1024 to {}/1024'.format(j, (i - 1) * batch_size, i * batch_size))
        batch_image = train_dataset[(i - 1) * batch_size:i * batch_size, :, :, :]
        batch_label = train_labels[(i - 1) * batch_size:i * batch_size, :]
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
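        # note: run_metadata is collected here but never passed to train_writer.add_run_metadata(),
        # so the full trace is not actually written to the TensorBoard log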
        # train_step.run(feed_dict={network_input: batch_image, true_labels: batch_label, keep_prob: 0.5})
        summary, _ = sess.run([merged, train_step],
                              feed_dict={network_input: batch_image, true_labels: batch_label, keep_prob: 0.5},
                              run_metadata=run_metadata, options=run_options)
        train_writer.add_summary(summary, j)
        if j % 10 == 0:
            print('Train Step {}, Current Test Accuracy is {}'.format(j, accuracy.eval(feed_dict={network_input: batch_image,
                                                                                                  true_labels: batch_label,
                                                                                                  keep_prob: 1.0})))
    print('-----Training Complete-----')
    print('-----Test Start-----')
    print('Test Accuracy is {}'.format(
        accuracy.eval(feed_dict={network_input: test_dataset, true_labels: test_labels, keep_prob: 1.0})))
    print('-----Test Complete-----')
During training, TensorBoard shows the accuracy and the cross-entropy loss. The results look like this:
As you can see, the accuracy just bounces up and down around 1/17, which is the same as picking a class at random. Meanwhile, the cross-entropy loss stays above 10.
When I compare this with the code in tflearn/alexnet, that version's loss drops from 1.7 to 0.9 and its accuracy climbs from 0.1 to 0.9, so I must be doing something wrong here.
After 1000 iterations, the test accuracy is 0.0476, which is below random guessing.
I have checked the model definition, the variable declarations and the loss, but I cannot find the reason for these unexpected results.
I also tried changing the learning rate to 0.1 and the batch size to 10/64/128, but nothing changed.
Thanks in advance for any help.
Answer 0 (score: 0)
As @ThomasPinetz pointed out, the problem was a learning rate that was too high.
After changing the learning rate from 0.001 to 1e-5 and extending the number of training iterations from 1000 to 15000, I get the following results from TensorBoard:
Accuracy: [accuracy plot]
Cross-entropy loss: [cross-entropy plot]
After training, the test accuracy is 0.7410, which is acceptable without any fine-tuning.
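For reference, here is a minimal sketch of the change described above, reusing the names from the script in the question (num_iterations is a new name introduced here just for readability; the values 1e-5 and 15000 are the ones reported in this answer, everything else stays as in the original script):

# hyperparameters adjusted as described above
learning_rate = 1e-5    # was 0.001
num_iterations = 15000  # was 1000

# only the optimizer's step size and the length of the training loop change
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy_loss)

for j in range(num_iterations):
    # ... same batch selection and sess.run([merged, train_step], ...) call as before ...
    pass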