我训练了一个 VGG 模型,用来对某个人脸数据集中包含 1000 个类别(ID)的子集进行分类。我使用的设置如下:
- 图像尺寸:224x224
- 损失函数:tf.losses.softmax_cross_entropy
- 优化器:AdamOptimizer
但是,模型的损失收敛到一个非零值(7.116〜7.117)。
我的代码如下(未包含数据读取部分):
def create_model(images, label_indices, class_num=1000):
    """Build the VGG-16 classification graph together with its training ops.

    Args:
        images: Batch of input images, passed unchanged to ``vgg_16``.
        label_indices: Integer class indices. They are one-hot encoded with
            depth ``class_num`` for the loss and used as-is for the accuracy
            metric.
        class_num: Number of output classes (depth of the one-hot labels and
            ``num_classes`` of the network).

    Returns:
        A ``Model`` holding the logits, the labels (both encodings), the
        accuracy metric, the exponential moving average of the loss and a
        grouped ``train`` op.
    """
    onehot_labels = tf.one_hot(label_indices, class_num)

    with tf.name_scope("vgg_network"):
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/nets/vgg.py
        logits, _endpoints = vgg_16(images, num_classes=class_num, global_pool=True)

    with tf.name_scope("softmax_cross_entropy_loss"):
        # Alternative kept from the original for reference:
        # loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_indices, logits=logits))
        loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    with tf.name_scope("accuracy"):
        # tf.metrics.accuracy returns a (value, update_op) pair, so fetching
        # this attribute yields a 2-tuple. The metric accumulates over every
        # run of update_op; it is not a per-batch accuracy.
        accuracy = tf.metrics.accuracy(labels=label_indices, predictions=tf.argmax(logits, axis=1))

    with tf.name_scope("training_network"):
        tvars = tf.trainable_variables()
        optim = tf.train.AdamOptimizer(a.lr, a.beta1)
        grads_and_vars = optim.compute_gradients(loss, var_list=tvars, colocate_gradients_with_ops=True)
        train_op = optim.apply_gradients(grads_and_vars)

    # The moving-average update is created under a control dependency on
    # train_op, so running update_losses also runs the optimizer step even
    # though train_op itself is not part of the returned group.
    # NOTE(review): the nesting here was reconstructed from a paste that lost
    # its indentation - confirm the extent of this scope against the original.
    with tf.control_dependencies([train_op]):
        ema = tf.train.ExponentialMovingAverage(decay=0.99)
        update_losses = ema.apply([loss])

    global_step = tf.train.get_or_create_global_step()
    incr_global_step = tf.assign(global_step, global_step + 1)

    return Model(
        logits=logits,
        onehot_labels=onehot_labels,
        accuracy=accuracy,
        label_indices=label_indices,
        # Reported loss is the smoothed (decay=0.99) moving average, not the
        # raw batch loss.
        loss=ema.average(loss),
        # loss=loss,
        grads_and_vars=None,
        train=tf.group(update_losses, incr_global_step),
    )
def main():
    """Build the input pipeline and the model, then run the training loop.

    Every 49 steps the loop prints the current epoch/step, throughput, an
    estimate of the remaining time, the smoothed loss and the accuracy metric.
    """
    # Make sure the output directory exists.
    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    # Read the TFRecord dataset.
    input_batch, _iterator = read_tfrecord()

    # Create the model.
    model = create_model(input_batch.images, input_batch.labels, a.class_num)

    # Session configuration.
    logdir = a.output_dir
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    # Training session.
    with sv.managed_session(config=sess_config) as sess:
        max_steps = 2**32
        start = time.time()

        # Everything is fetched in a single run call, so the reported values
        # come from the same step that performs the update.
        fetches = {
            "train": model.train,
            "global_step": sv.global_step,
            "label_indices": model.label_indices,
            "loss": model.loss,
            "accuracy": model.accuracy,
        }

        for step in range(max_steps):
            results = sess.run(fetches)

            if step % 49 == 0:
                train_epoch = math.ceil(results["global_step"] / input_batch.steps_per_epoch)
                train_step = (results["global_step"] - 1) % input_batch.steps_per_epoch + 1
                # Throughput in images per second since the loop started.
                rate = (step + 1) * a.batch_size / (time.time() - start)
                # Remaining time in seconds; printed in minutes below.
                remaining = (max_steps - step) * a.batch_size / rate
                print("progress epoch %d step %d image/sec %0.1f remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                print("loss", results["loss"])
                print("accuracy", results["accuracy"])
我得到的结果是:
progress epoch 0 step 450242 image/sec 4.3 remaining 133877094m
loss 6.9084997
accuracy (0.0, 0.0)
progress epoch 1 step 49 image/sec 26.3 remaining 21736389m
loss 6.6496363
accuracy (0.56377554, 0.5525)
progress epoch 1 step 98 image/sec 27.7 remaining 20691865m
loss 6.3198533
accuracy (0.36862245, 0.36489898)
progress epoch 1 step 147 image/sec 28.2 remaining 20324250m
loss 6.1094604
accuracy (0.28146258, 0.2795608)
... .....
准确率持续下降(实际上降到了零),损失最终收敛到约 7.116。
模型收敛后,我检查了 VGG 输出的 logits,它们的取值在 -20.6152 到 -21.026 之间。
我的代码有什么问题? 如有必要,我会添加数据读取代码。