Question

我训练了一个 VGG 模型，用来对一个人脸数据集的子集（共 1000 个类别，即 1000 个 ID）进行分类。我使用了：

图像尺寸为224x224，
tf.losses.softmax_cross_entropy
AdamOptimizer

但是，模型的损失收敛到一个非零值（7.116〜7.117）。

我的代码（未包含数据读取函数）：

def create_model(images, label_indices, class_num=1000):
    """Build the VGG-16 classification graph together with its training ops.

    The optimizer hyper-parameters are read from the module-level ``a``
    object (``a.lr`` and ``a.beta1``).

    Args:
        images: Input images passed straight to ``vgg_16``.
        label_indices: Integer class index of every image; used both for the
            one-hot targets and for the accuracy metric.
        class_num: Number of classes; sets the one-hot depth and the number
            of logits produced by the network.

    Returns:
        A ``Model`` whose fields are:
            logits: Raw network outputs.
            onehot_labels: One-hot encoding of ``label_indices``.
            accuracy: The ``(value, update_op)`` pair returned by
                ``tf.metrics.accuracy`` (a streaming metric).
            label_indices: The ``label_indices`` argument, unchanged.
            loss: Exponential moving average (decay 0.99) of the batch loss.
            grads_and_vars: Always ``None``.
            train: Op that performs one optimizer step, advances the global
                step and then updates the loss moving average.
    """
    onehot_labels = tf.one_hot(label_indices, class_num)
    with tf.name_scope("vgg_network"):
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/slim/python/slim/nets/vgg.py
        logits, _ = vgg_16(images, num_classes=class_num, global_pool=True)
    with tf.name_scope("softmax_cross_entropy_loss"):
        # NOTE(review): a uniform prediction over class_num classes yields a
        # cross-entropy of ln(class_num) (~6.908 for 1000 classes), so a loss
        # that plateaus near that value suggests near-constant logits. Worth
        # checking the learning rate and the input preprocessing -- TODO confirm.
        loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
    with tf.name_scope("accuracy"):
        accuracy = tf.metrics.accuracy(labels=label_indices, predictions=tf.argmax(logits, axis=1))

    with tf.name_scope("training_network"):
        global_step = tf.train.get_or_create_global_step()
        optim = tf.train.AdamOptimizer(a.lr, a.beta1)
        grads_and_vars = optim.compute_gradients(
            loss,
            var_list=tf.trainable_variables(),
            colocate_gradients_with_ops=True,
        )
        # Let the optimizer advance the global step so that the increment is
        # part of the parameter update instead of a separate, unordered op.
        train_op = optim.apply_gradients(grads_and_vars, global_step=global_step)

        # The moving average is only refreshed after the optimizer step ran.
        with tf.control_dependencies([train_op]):
            ema = tf.train.ExponentialMovingAverage(decay=0.99)
            update_losses = ema.apply([loss])

    return Model(
        logits=logits,
        onehot_labels=onehot_labels,
        accuracy=accuracy,
        label_indices=label_indices,
        loss=ema.average(loss),
        grads_and_vars=None,
        train=tf.group(update_losses),
        )

def main():
    """Run the training loop and print progress every 49 steps.

    Creates ``a.output_dir`` if needed, builds the input pipeline with
    ``read_tfrecord()`` and the graph with ``create_model()``, then runs the
    training op inside a ``tf.train.Supervisor`` managed session until
    ``2**32`` steps have been executed or the supervisor requests a stop.
    """
    ################ check directory #########################################
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(a.output_dir, exist_ok=True)
    ################ read TFRecord dataset ###################################
    input_batch, iterator  = read_tfrecord()

    ################ create model ############################################
    model = create_model(input_batch.images, input_batch.labels, a.class_num)

    ################ configuration ###########################################
    logdir = a.output_dir
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    ############### session ######################################################
    with sv.managed_session(config=sess_config) as sess:
        max_steps = 2**32
        start = time.time()
        fetches = {
            "train": model.train,
            "global_step": sv.global_step,
            "label_indices": model.label_indices,
            "loss": model.loss,
            "accuracy": model.accuracy,
        }
        for step in range(max_steps):
            # Leave the loop as soon as the supervisor asks for a stop
            # instead of issuing further sess.run() calls.
            if sv.should_stop():
                break
            results = sess.run(fetches)

            if step % 49 == 0:
                train_epoch = math.ceil(results["global_step"] / input_batch.steps_per_epoch)
                train_step = (results["global_step"] - 1) % input_batch.steps_per_epoch + 1
                # Throughput so far, and the time left at that throughput.
                rate = (step + 1) * a.batch_size / (time.time() - start)
                remaining = (max_steps - step) * a.batch_size / rate
                print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                print("loss", results["loss"])
                # A (value, update_op) pair, see tf.metrics.accuracy.
                print("accuracy", results["accuracy"])

我得到的结果是：

progress  epoch 0  step 450242  image/sec 4.3  remaining 133877094m
loss 6.9084997
accuracy (0.0, 0.0)
progress  epoch 1  step 49  image/sec 26.3  remaining 21736389m
loss 6.6496363
accuracy (0.56377554, 0.5525)
progress  epoch 1  step 98  image/sec 27.7  remaining 20691865m
loss 6.3198533
accuracy (0.36862245, 0.36489898)
progress  epoch 1  step 147  image/sec 28.2  remaining 20324250m
loss 6.1094604
accuracy (0.28146258, 0.2795608)

... .....

loss curve

准确率一直在下降（实际上降到了零），损失最终收敛到约 7.116。

模型收敛后，我检查了 VGG 输出的 logits，发现它们的取值都在 -20.6152 和 -21.026 之间。

我的代码有什么问题？如有必要，我会添加数据读取代码。

为什么在vgg模型中tf.losses.softmax_cross_entropy收敛到非零值（7.116）？

0 个答案: