我正在重新实现一个 TensorFlow 模型。在开始训练之前,我运行以下代码:
# TF1.x: build the op that assigns every already-created variable its
# initial value, then run it in the session.  NOTE(review): this only
# initializes variables that exist at the moment the op is created.
init = tf.global_variables_initializer()
sess.run(init)
但是,我仍然得到 uninitialized variable(变量未初始化)错误,训练过程无法开始。我知道之前有一个类似的问题,但它并不能解决我的问题。我在单 GPU 版本和多 GPU 版本之间切换过代码,但这也无济于事。我在这里贴出了部分代码。虽然不是整个项目,但我认为足以展示所有关键点。请给我一些帮助。
谢谢大家对我的帮助!
# Build the training and validation data feeders.
# FIX(review): the original line contained machine-translated identifiers
# ("数据集_dir", "标志='train'") which are syntax errors in Python;
# restored to the keyword arguments `dataset_dir` / `flags` used by the
# lanenet project's LaneNetDataFeeder.
train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
    dataset_dir=dataset_dir, flags='train'
)
val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
    dataset_dir=dataset_dir, flags='val'
)
# set lanenet
# Two LaneNet graphs that share weights: the validation net is built with
# reuse=True so it reads the variables created by the training net.
train_net = lanenet.LaneNet(net_flag=net_flag, phase='train', reuse=False)
val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)
# set compute graph node
# Input tensors (images + binary/instance segmentation labels) pulled from
# the feed pipelines.  NOTE(review): the second positional argument is 1 —
# presumably the number of prefetch epochs/threads; confirm against
# LaneNetDataFeeder.inputs.
train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
CFG.TRAIN.BATCH_SIZE, 1
)
val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
CFG.TRAIN.VAL_BATCH_SIZE, 1
)
# set average container
tower_grads = []  # per-GPU gradient lists, averaged after the tower loop
train_tower_loss = []  # per-GPU training losses
val_tower_loss = []  # per-GPU validation losses
batchnorm_updates = None  # filled from tower 0's UPDATE_OPS collection
train_summary_op_updates = None  # filled from tower 0's SUMMARIES collection
# set lr
# global_step is incremented once per apply_gradients call (see below).
global_step = tf.Variable(0, trainable=False)
# Polynomial learning-rate decay.  NOTE(review): decay_steps is set to
# CFG.TRAIN.EPOCHS while global_step counts optimizer steps (batches), so
# the schedule decays over EPOCHS *steps*, not epochs — confirm intended.
learning_rate = tf.train.polynomial_decay(
learning_rate=CFG.TRAIN.LEARNING_RATE,
global_step=global_step,
decay_steps=CFG.TRAIN.EPOCHS,
power=0.9
)
# set optimizer
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate, momentum=CFG.TRAIN.MOMENTUM
)
# set distributed train op
# FIX(review): the pasted code lost all indentation, making this section
# syntactically invalid; the nesting below is reconstructed so each GPU
# tower builds its train/val ops under its own device and name scope.
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(CFG.TRAIN.GPU_NUM):
        with tf.device('/gpu:{:d}'.format(i)):
            with tf.name_scope('tower_{:d}'.format(i)) as _:
                train_loss, grads = compute_net_gradients(
                    train_images, train_binary_labels, train_instance_labels,
                    train_net, optimizer
                )
                # Only use the mean and var in the first gpu tower to update
                # the parameters (avoids collecting duplicate BN update ops).
                if i == 0:
                    batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    train_summary_op_updates = tf.get_collection(tf.GraphKeys.SUMMARIES)
                tower_grads.append(grads)
                train_tower_loss.append(train_loss)
            with tf.name_scope('validation_{:d}'.format(i)) as _:
                # Validation shares weights (val_net was built with reuse=True);
                # the gradients it returns are discarded.
                val_loss, _ = compute_net_gradients(
                    val_images, val_binary_labels, val_instance_labels,
                    val_net, optimizer)
                val_tower_loss.append(val_loss)

# Average the per-tower gradients and losses into single training signals.
grads = average_gradients(tower_grads)
avg_train_loss = tf.reduce_mean(train_tower_loss)
avg_val_loss = tf.reduce_mean(val_tower_loss)
# Track the moving averages of all trainable variables
# (num_updates=global_step makes the decay ramp up early in training).
variable_averages = tf.train.ExponentialMovingAverage(
CFG.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
variables_to_average = tf.trainable_variables() + tf.moving_average_variables()
# apply() creates one shadow variable per tracked variable — these shadow
# variables also need initialization (or restoring) before training.
variables_averages_op = variable_averages.apply(variables_to_average)
# Group all the op needed for training
# NOTE(review): if CFG.TRAIN.GPU_NUM is 0 the tower loop never runs and
# batchnorm_updates stays None, so tf.group(*None) would raise here.
batchnorm_updates_op = tf.group(*batchnorm_updates)
# apply_gradients also creates the Momentum slot variables and increments
# global_step once per call.
apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
train_op = tf.group(apply_gradient_op, variables_averages_op,
batchnorm_updates_op)
# Set tf summary save path
tboard_save_path = 'tboard/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
os.makedirs(tboard_save_path, exist_ok=True)
summary_writer = tf.summary.FileWriter(tboard_save_path)
# Scalar summaries for the averaged losses and the decayed learning rate.
avg_train_loss_scalar = tf.summary.scalar(
name='average_train_loss', tensor=avg_train_loss
)
avg_val_loss_scalar = tf.summary.scalar(
name='average_val_loss', tensor=avg_val_loss
)
learning_rate_scalar = tf.summary.scalar(
name='learning_rate_scalar', tensor=learning_rate
)
# Training summary also includes whatever tower 0 put in the SUMMARIES
# collection; validation only logs its averaged loss.
train_merge_summary_op = tf.summary.merge(
[avg_train_loss_scalar, learning_rate_scalar] + train_summary_op_updates
)
val_merge_summary_op = tf.summary.merge([avg_val_loss_scalar])
# set tensorflow saver
# Saver is created after all variables (global_step, optimizer slots, EMA
# shadows) exist, so it covers the full variable set.
saver = tf.train.Saver()
model_save_dir = 'model/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
os.makedirs(model_save_dir, exist_ok=True)
# Timestamped checkpoint name so successive runs do not overwrite each other.
train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(net_flag, str(train_start_time))
model_save_path = ops.join(model_save_dir, model_name)
# set sess config
# allow_soft_placement lets ops without a GPU kernel fall back to CPU.
sess_config = tf.ConfigProto(device_count={'GPU': CFG.TRAIN.GPU_NUM}, allow_soft_placement=True)
sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
sess_config.gpu_options.allocator_type = 'BFC'
# Set the training parameters
train_epochs = CFG.TRAIN.EPOCHS
log.info('Global configuration is as follows:')
log.info(CFG)
sess = tf.Session(config=sess_config)
summary_writer.add_graph(sess.graph)

with sess.as_default():
    # Dump the (untrained) graph definition for later inspection/serving.
    tf.train.write_graph(
        graph_or_graph_def=sess.graph, logdir='',
        name='{:s}/lanenet_model.pb'.format(model_save_dir))

    # FIX(review): ALWAYS run the global initializer, even when restoring.
    # saver.restore() only assigns variables present in the checkpoint;
    # freshly created variables that the checkpoint does not contain
    # (global_step, Momentum optimizer slots, EMA shadow variables) were
    # previously left uninitialized in the restore branch — the exact
    # "uninitialized variable" error reported at training start.
    init = tf.global_variables_initializer()
    sess.run(init)

    if weights_path is None:
        log.info('Training from scratch')
    else:
        log.info('Restore model from last model checkpoint {:s}'.format(weights_path))
        # Restore on top of the initialized graph: checkpointed values
        # overwrite the fresh initial values for every matching variable.
        saver.restore(sess=sess, save_path=weights_path)