我想在模型训练中分析操作状态。因此,我在会话运行中设置了“ RunOptions”和“ RunMetadata”。但是结果表明,“ global_step / sec”慢一倍。在添加用于记录时间轴的代码之前,它非常稳定。结果在这里:
这是我的代码:
record = hvd.rank() == 0
run_options = None
run_metadata = None
if record:
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
hooks = [
hvd.BroadcastGlobalVariablesHook(0),
tf.train.SummarySaverHook(save_secs=FLAGS.save_interval_secs,
output_dir=FLAGS.train_dir,summary_op=summary_op),
tf.train.StopAtStepHook(last_step=FLAGS.max_steps // hvd.size()),
tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': total_loss}, every_n_iter=10),
]
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
hooks=hooks,
config=config,
log_step_count_steps=FLAGS.log_every_n_steps,
save_summaries_secs=FLAGS.save_summaries_secs
) as mon_sess:
while not mon_sess.should_stop():
mon_sess.run(train_op,options=run_options, run_metadata=run_metadata)
if record:
tl = timeline.Timeline(run_metadata.step_stats)
ctf = tl.generate_chrome_trace_format()
with open('timeline.json', 'w') as f:
f.write(ctf)
无论是一个GPU还是两个GPU。