I have been using TensorFlow to fine-tune resnet_v2_101 for a classification problem. Every time I restore my model from a checkpoint, the loss increases sharply and the accuracy drops, and it takes a long time before the loss slowly comes back down.
The same problem occurs with the Inception model under my training framework.
It is worth mentioning that this does not happen when I restore from the pretrained model (the loss keeps decreasing the whole time).
This has been bothering me for a long time. Thank you.
Here is some of my code:
with def_graph.as_default() as graph:
    # map class name to numbers using tf.contrib.lookup.index_table_from_tensor
    ....
    # create dataset using slim.dataset_data_provider.DatasetDataProvider
    ....
    batch_images, batch_labels = train_dataset.create_dataset()

    with tf.device('/gpu:0'):
        (train_op, accum_op, zero_op, global_step, metrics_op, variables_to_restore,
         pred_op, lr, accuracy, total_loss) = train_step(batch_images, batch_labels)

    summary_op = tf.summary.merge_all()
    pre_train_saver = tf.train.Saver(variables_to_restore)

    # Define an init function that loads the pretrained checkpoint.
    # sess is the managed session passed by the Supervisor.
    def load_pretrain(sess):
        pre_train_saver.restore(sess, PRETRAINED_MODEL_PATH)

    init_op = tf.group(tf.global_variables_initializer())
    sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn=load_pretrain, init_op=init_op,
                             summary_op=None, save_model_secs=8000,
                             checkpoint_basename='resnet101_v2_model.ckpt')
    config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)

    with sv.managed_session(config=config) as sess:
        for step in range(int(num_steps_per_epoch * NUM_EPOCHES)):
            if sv.should_stop():
                break
            start_time = time.time()
            with tf.device('/gpu:0'):
                # accumulate gradient to get bigger batch_size
                sess.run(zero_op)
                for _ in range(1, ACCUMULATE_STEP):
                    sess.run([accum_op, total_loss])
                _, _, _, cur_loss, cur_acc, total_step, cur_lr = sess.run(
                    [train_op, accum_op, metrics_op, total_loss, accuracy, global_step, lr])
            time_elapsed = time.time() - start_time
        sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
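For context: as far as I understand tf.train.Supervisor, init_op and init_fn (load_pretrain above) only run when no checkpoint can be recovered from logdir, so once LOG_PATH contains a checkpoint the Supervisor restores that checkpoint and load_pretrain is skipped. Below is a minimal sketch for listing what the saved checkpoint actually contains, e.g. whether the Adam slot variables, the gradient accumulators and the EMA shadow variables were saved (this is not part of the training script; CKPT_DIR is just a placeholder for LOG_PATH):

import tensorflow as tf

# CKPT_DIR is illustrative; in the setup above it would be LOG_PATH.
ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    # Print every variable name and shape stored in the checkpoint file.
    for name, shape in tf.train.list_variables(ckpt.model_checkpoint_path):
        print(name, shape)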
Here is the train_step code:
def train_step(input_examples, one_hot_labels):
    with slim.arg_scope(resnet2.resnet_arg_scope()):
        logits, end_points = resnet2.resnet_v2_101(input_examples, NUM_CLASS, is_training=True)
    variables_to_restore = slim.get_variables_to_restore(exclude=['resnet_v2_101/logits'])

    end_points['logits_output_squeezed'] = tf.squeeze(logits)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                           logits=end_points['logits_output_squeezed'],
                                           label_smoothing=0.1)
    total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well

    global_step = tf.train.get_or_create_global_step(graph=graph)
    lr = my_exponential_decay(  # tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=learning_rate_decay_factor,
        staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    moving_average_variables = slim.get_model_variables()
    variable_averages = tf.train.ExponentialMovingAverage(moving_average_decay, global_step)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variable_averages.apply(moving_average_variables))

    accumulate_factor = tf.constant([1. / ACCUMULATE_STEP])
    train_op, accum_ops, zero_ops = my_create_train_op(total_loss, optimizer, False, accumulate_factor)

    predictions = tf.argmax(tf.squeeze(end_points['predictions']), 1)
    probabilities = end_points['predictions']
    accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
        predictions, tf.argmax(one_hot_labels, 1), name='train_accuracy')
    metrics_op = tf.group(accuracy_update)

    return (train_op, accum_ops, zero_ops, global_step, metrics_op,
            variables_to_restore, predictions, lr, accuracy, total_loss)
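As a side note, the ExponentialMovingAverage pattern used above boils down to the following toy sketch (illustrative names, not part of my code): apply() creates the shadow variables and returns the op that updates them, and variables_to_restore() gives the name map needed to load the averaged values back into the model variables.

import tensorflow as tf

w = tf.Variable(1.0, name='w')
ema = tf.train.ExponentialMovingAverage(decay=0.999)
# apply() creates the shadow variable 'w/ExponentialMovingAverage' and returns its update op;
# putting it in UPDATE_OPS is how train_step hooks it into the update_barrier in my_create_train_op.
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema.apply([w]))

# A saver built from variables_to_restore() loads the shadow (averaged) values
# into the model variables, e.g. for evaluation with the EMA weights.
ema_saver = tf.train.Saver(ema.variables_to_restore())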
And here is the gradient accumulation code:
# Assumed imports, mirroring the ones used in slim's learning.py:
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.contrib.slim.python.slim.learning import add_gradients_summaries

def my_create_train_op(total_loss, optimizer, summarize_gradients=False, accumulate_factor=None):
    global_step = tf.train.get_or_create_global_step()

    # Make the loss depend on UPDATE_OPS (batch norm statistics, the EMA update added in train_step).
    update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    if update_ops:
        with ops.control_dependencies(update_ops):
            barrier = control_flow_ops.no_op(name='update_barrier')
        total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

    variables_to_train = tf_variables.trainable_variables()
    # One accumulator per trainable variable, initialized with zeros.
    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                  for tv in variables_to_train]
    zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

    # Call the optimizer's compute_gradients to obtain the list of (gradient, variable) pairs.
    grads = optimizer.compute_gradients(
        total_loss,
        variables_to_train,
        gate_gradients=tf_optimizer.Optimizer.GATE_OP,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    ## Add each gradient to the accumulator initialized with zeros above
    ## (works because accum_vars and grads are in the same order).
    if accumulate_factor is not None:
        total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')
        with tf.control_dependencies([total_loss]):
            accum_ops = [accum_vars[i].assign_add(gv[0])
                         for i, gv in enumerate(grads) if gv[0] is not None]
            ## Define the training step (the part that updates the variable values):
            ## apply the accumulators scaled by accumulate_factor (1/ACCUMULATE_STEP).
            accumulate_grads = [(tf.multiply(accum_vars[i], accumulate_factor), gv[1])
                                for i, gv in enumerate(grads) if gv[0] is not None]
    else:
        accum_ops = tf.no_op(name='accum_pass_by')

    if accumulate_factor is not None:
        # Summarize gradients.
        if summarize_gradients:
            with ops.name_scope('summarize_grads'):
                add_gradients_summaries(accumulate_grads)
        grad_updates = optimizer.apply_gradients(accumulate_grads, global_step=global_step)
    else:
        # Summarize gradients.
        if summarize_gradients:
            with ops.name_scope('summarize_grads'):
                add_gradients_summaries(grads)
        grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

    with ops.name_scope('train_op'):
        # Ensure the train tensor computes grad_updates.
        train_op = control_flow_ops.with_dependencies([grad_updates], total_loss)

    # Add the operation used for training to the 'train_op' collection.
    train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
    if train_op not in train_ops:
        train_ops.append(train_op)

    return train_op, accum_ops, zero_ops
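Stripped of the slim/UPDATE_OPS machinery, the accumulation scheme above is the usual three-phase pattern: zero the accumulators, add the per-batch gradients for several batches, then apply the averaged accumulators once. A standalone toy sketch of that pattern (all names here are illustrative, independent of the code above):

import tensorflow as tf

ACCUM_STEPS = 4
x = tf.Variable(3.0)
loss = tf.square(x)
opt = tf.train.AdamOptimizer(0.01)

grads_and_vars = opt.compute_gradients(loss, [x])
# One non-trainable accumulator per trainable variable, initialized to zeros.
accum = [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
         for _, v in grads_and_vars]
zero_op = [a.assign(tf.zeros_like(a)) for a in accum]
accum_op = [a.assign_add(g) for a, (g, _) in zip(accum, grads_and_vars)]
# Apply the mean of the accumulated gradients (the 1/ACCUM_STEPS scaling).
apply_op = opt.apply_gradients([(a / ACCUM_STEPS, v)
                                for a, (_, v) in zip(accum, grads_and_vars)])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(zero_op)                      # phase 1: reset the accumulators
    for _ in range(ACCUM_STEPS):           # phase 2: accumulate gradients
        sess.run(accum_op)
    sess.run(apply_op)                     # phase 3: one optimizer step with the mean gradient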