I have been using TensorFlow to fine-tune resnet_v2_101 for a classification problem. Every time I restore my model from a checkpoint, the loss increases sharply and the accuracy drops, and it takes a long time before the loss slowly comes back down.
The same problem occurs with the Inception model under my training framework.
It is worth mentioning that this does not happen when I restore from the pretrained model (the loss keeps decreasing the whole time).
This has been bothering me for a long time. Thank you.
Here is some of my code:
with def_graph.as_default() as graph:
    # map class name to numbers using tf.contrib.lookup.index_table_from_tensor
    ....
    # create dataset using slim.dataset_data_provider.DatasetDataProvider
    ....
    batch_images, batch_labels = train_dataset.create_dataset()

    with tf.device('/gpu:0'):
        (train_op, accum_op, zero_op, global_step, metrics_op, variables_to_restore,
         pred_op, lr, accuracy, total_loss) = train_step(batch_images, batch_labels)

    summary_op = tf.summary.merge_all()
    pre_train_saver = tf.train.Saver(variables_to_restore)

    # Define an init function that loads the pretrained checkpoint.
    # sess is the managed session passed by the Supervisor.
    def load_pretrain(sess):
        pre_train_saver.restore(sess, PRETRAINED_MODEL_PATH)

    init_op = tf.group(tf.global_variables_initializer())
    sv = tf.train.Supervisor(logdir=LOG_PATH, init_fn=load_pretrain, init_op=init_op,
                             summary_op=None, save_model_secs=8000,
                             checkpoint_basename='resnet101_v2_model.ckpt')
    config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)

    with sv.managed_session(config=config) as sess:
        for step in range(int(num_steps_per_epoch * NUM_EPOCHES)):
            if sv.should_stop():
                break
            start_time = time.time()
            with tf.device('/gpu:0'):
                # accumulate gradient to get bigger batch_size
                sess.run(zero_op)
                for _ in range(1, ACCUMULATE_STEP):
                    sess.run([accum_op, total_loss])
                _, _, _, cur_loss, cur_acc, total_step, cur_lr = sess.run(
                    [train_op, accum_op, metrics_op, total_loss, accuracy, global_step, lr])
            time_elapsed = time.time() - start_time
        sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
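For context: as far as I understand tf.train.Supervisor, init_op and init_fn (load_pretrain above) only run when no checkpoint can be recovered from logdir, so once LOG_PATH contains a checkpoint the Supervisor restores that checkpoint and load_pretrain is skipped. Below is a minimal sketch for listing what the saved checkpoint actually contains, e.g. whether the Adam slot variables, the gradient accumulators and the EMA shadow variables were saved (this is not part of the training script; CKPT_DIR is just a placeholder for LOG_PATH):

import tensorflow as tf

# CKPT_DIR is illustrative; in the setup above it would be LOG_PATH.
ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    # Print every variable name and shape stored in the checkpoint file.
    for name, shape in tf.train.list_variables(ckpt.model_checkpoint_path):
        print(name, shape)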
Here is the train_step code:
def train_step(input_examples, one_hot_labels):
    with slim.arg_scope(resnet2.resnet_arg_scope()):
        logits, end_points = resnet2.resnet_v2_101(input_examples, NUM_CLASS, is_training=True)
    variables_to_restore = slim.get_variables_to_restore(exclude=['resnet_v2_101/logits'])

    end_points['logits_output_squeezed'] = tf.squeeze(logits)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                           logits=end_points['logits_output_squeezed'],
                                           label_smoothing=0.1)
    total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well

    global_step = tf.train.get_or_create_global_step(graph=graph)
    lr = my_exponential_decay(  # tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=learning_rate_decay_factor,
        staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    moving_average_variables = slim.get_model_variables()
    variable_averages = tf.train.ExponentialMovingAverage(moving_average_decay, global_step)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variable_averages.apply(moving_average_variables))

    accumulate_factor = tf.constant([1. / ACCUMULATE_STEP])
    train_op, accum_ops, zero_ops = my_create_train_op(total_loss, optimizer, False, accumulate_factor)

    predictions = tf.argmax(tf.squeeze(end_points['predictions']), 1)
    probabilities = end_points['predictions']
    accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
        predictions, tf.argmax(one_hot_labels, 1), name='train_accuracy')
    metrics_op = tf.group(accuracy_update)

    return (train_op, accum_ops, zero_ops, global_step, metrics_op,
            variables_to_restore, predictions, lr, accuracy, total_loss)
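As a side note, the ExponentialMovingAverage pattern used above boils down to the following toy sketch (illustrative names, not part of my code): apply() creates the shadow variables and returns the op that updates them, and variables_to_restore() gives the name map needed to load the averaged values back into the model variables.

import tensorflow as tf

w = tf.Variable(1.0, name='w')
ema = tf.train.ExponentialMovingAverage(decay=0.999)
# apply() creates the shadow variable 'w/ExponentialMovingAverage' and returns its update op;
# putting it in UPDATE_OPS is how train_step hooks it into the update_barrier in my_create_train_op.
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema.apply([w]))

# A saver built from variables_to_restore() loads the shadow (averaged) values
# into the model variables, e.g. for evaluation with the EMA weights.
ema_saver = tf.train.Saver(ema.variables_to_restore())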
And here is the gradient accumulation code:
# Assumed imports, mirroring the ones used in slim's learning.py:
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.contrib.slim.python.slim.learning import add_gradients_summaries

def my_create_train_op(total_loss, optimizer, summarize_gradients=False, accumulate_factor=None):
    global_step = tf.train.get_or_create_global_step()

    # Make the loss depend on UPDATE_OPS (batch norm statistics, the EMA update added in train_step).
    update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    if update_ops:
        with ops.control_dependencies(update_ops):
            barrier = control_flow_ops.no_op(name='update_barrier')
        total_loss = control_flow_ops.with_dependencies([barrier], total_loss)

    variables_to_train = tf_variables.trainable_variables()
    # One accumulator per trainable variable, initialized with zeros.
    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                  for tv in variables_to_train]
    zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

    # Call the optimizer's compute_gradients to obtain the list of (gradient, variable) pairs.
    grads = optimizer.compute_gradients(
        total_loss,
        variables_to_train,
        gate_gradients=tf_optimizer.Optimizer.GATE_OP,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    ## Add each gradient to the accumulator initialized with zeros above
    ## (works because accum_vars and grads are in the same order).
    if accumulate_factor is not None:
        total_loss = array_ops.check_numerics(total_loss, 'LossTensor is inf or nan')
        with tf.control_dependencies([total_loss]):
            accum_ops = [accum_vars[i].assign_add(gv[0])
                         for i, gv in enumerate(grads) if gv[0] is not None]
            ## Define the training step (the part that updates the variable values):
            ## apply the accumulators scaled by accumulate_factor (1/ACCUMULATE_STEP).
            accumulate_grads = [(tf.multiply(accum_vars[i], accumulate_factor), gv[1])
                                for i, gv in enumerate(grads) if gv[0] is not None]
    else:
        accum_ops = tf.no_op(name='accum_pass_by')

    if accumulate_factor is not None:
        # Summarize gradients.
        if summarize_gradients:
            with ops.name_scope('summarize_grads'):
                add_gradients_summaries(accumulate_grads)
        grad_updates = optimizer.apply_gradients(accumulate_grads, global_step=global_step)
    else:
        # Summarize gradients.
        if summarize_gradients:
            with ops.name_scope('summarize_grads'):
                add_gradients_summaries(grads)
        grad_updates = optimizer.apply_gradients(grads, global_step=global_step)

    with ops.name_scope('train_op'):
        # Ensure the train tensor computes grad_updates.
        train_op = control_flow_ops.with_dependencies([grad_updates], total_loss)

    # Add the operation used for training to the 'train_op' collection.
    train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
    if train_op not in train_ops:
        train_ops.append(train_op)

    return train_op, accum_ops, zero_ops
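Stripped of the slim/UPDATE_OPS machinery, the accumulation scheme above is the usual three-phase pattern: zero the accumulators, add the per-batch gradients for several batches, then apply the averaged accumulators once. A standalone toy sketch of that pattern (all names here are illustrative, independent of the code above):

import tensorflow as tf

ACCUM_STEPS = 4
x = tf.Variable(3.0)
loss = tf.square(x)
opt = tf.train.AdamOptimizer(0.01)

grads_and_vars = opt.compute_gradients(loss, [x])
# One non-trainable accumulator per trainable variable, initialized to zeros.
accum = [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
         for _, v in grads_and_vars]
zero_op = [a.assign(tf.zeros_like(a)) for a in accum]
accum_op = [a.assign_add(g) for a, (g, _) in zip(accum, grads_and_vars)]
# Apply the mean of the accumulated gradients (the 1/ACCUM_STEPS scaling).
apply_op = opt.apply_gradients([(a / ACCUM_STEPS, v)
                                for a, (_, v) in zip(accum, grads_and_vars)])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(zero_op)                      # phase 1: reset the accumulators
    for _ in range(ACCUM_STEPS):           # phase 2: accumulate gradients
        sess.run(accum_op)
    sess.run(apply_op)                     # phase 3: one optimizer step with the mean gradient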