1)获取数据
我有这样的数据代码
train_dataset = tf.data.TFRecordDataset(train_files)
val_dataset = ....
test_dataset = ....
train_iterator = train_dataset.make_initializable_iterator()
val_iterator = val_dataset.make_initializable_iterator()
test_iterator = test_dataset.make_initializable_iterator()
train_iterator = train_iterator.repeat(None) # training dataset repeat indefinitely
val_iterator = val_iterator.repeat(1)
test_iterator = test_iterator.repeat(1)
然后我使用一个字符串 handle 占位符在它们之间切换：
handle = tf.placeholder(tf.string, shape=[])
# Feedable iterator: one graph-level iterator whose concrete data source is
# selected at sess.run time by feeding a string handle into `handle`.
iterator = tf.data.Iterator.from_string_handle(handle, train_dataset.output_types, train_dataset.output_shapes)
# A single shared pair of input tensors used for train / val / test alike.
image_batch, label_batch = iterator.get_next()
2)建立计算图
网络的结果就像:
is_training = tf.placeholder(tf.bool, name='is_training') # for FC layer
# Forward pass.  `my_net` is defined elsewhere; presumably `is_training`
# toggles train-time behavior (e.g. dropout) in the FC layers -- TODO confirm.
logits_batch = my_net(image_batch, is_training = is_training)
注意。我使用ExponentialMovingAverage进行训练。
# Global-step variable; drives the ExponentialMovingAverage decay schedule.
g_step_op = tf.Variable(0, trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(FLAGS.moving_average_decay, g_step_op)
# Creates and updates shadow (moving-average) copies of all trainable variables.
variable_averages_op = variable_averages.apply(tf.trainable_variables())
和
# Streaming accuracy: acc_update_op accumulates total/count in local variables;
# acc_value_op only reads the current ratio and consumes no input batch.
acc_value_op, acc_update_op = tf.metrics.accuracy(labels=label_batch, predictions=tf.argmax(tf.nn.softmax(logits_batch), -1))
tf.summary.scalar("accuracy", acc_value_op)
# tf.metrics.accuracy registers its counters as LOCAL_VARIABLES under the
# "accuracy" name scope, so they can be fetched and reset independently.
accuracy_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="accuracy")
# accuracy_vars_initializer is used to reset accuracy
accuracy_vars_initializer = tf.variables_initializer(var_list=accuracy_vars)
train_step = ...
# One run of train_op performs the optimizer step AND the EMA shadow update.
with tf.control_dependencies([train_step, variable_averages_op]):
train_op = tf.no_op(name="train")
3)训练网络
我的训练循环如下：
# One epoch = number_of_training_data // batch_size steps.  Integer division
# is required: a float here would make the exact equality test against
# `FLAGS.epoch * steps_per_epoch - 1` below unreliable in Python 3.
steps_per_epoch = number_of_training_data // batch_size

with tf.Session() as sess:
    train_iterator_handle = sess.run(train_iterator.string_handle())
    val_iterator_handle = sess.run(val_iterator.string_handle())

    def val():
        """Run one full pass over the validation set, printing loss/accuracy."""
        # change to val dataset and reset accuracy
        sess.run([val_iterator.initializer, accuracy_vars_initializer])
        while True:
            try:
                loss_value, _ = sess.run(
                    [loss, acc_update_op],
                    feed_dict={handle: val_iterator_handle, is_training: False})
                # acc_value_op only reads the metric's local counters, so this
                # second run does not consume a validation batch.
                acc_value = sess.run(
                    acc_value_op,
                    feed_dict={handle: val_iterator_handle, is_training: False})
                print("val batch loss = %g,val acc = %g." % (loss_value, acc_value))
            except tf.errors.OutOfRangeError:
                print("val dataset finished")
                break
        # NOTE(review): the training iterator keeps its own position while the
        # val handle is being fed, so re-initializing it here restarts the
        # training data from the beginning after EVERY validation.  Drop this
        # line unless that restart is intended -- confirm.
        sess.run(train_iterator.initializer)

    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    # activate training dataset and reset accuracy
    sess.run([train_iterator.initializer, accuracy_vars_initializer])
    # `return` is only legal inside a function; at script level (as pasted) it
    # raises SyntaxError.  Use a flag to terminate the loop instead.
    done = False
    while not done:
        try:
            _, loss_value, g_step, _ = sess.run(
                [train_op, loss, g_step_op, acc_update_op],
                feed_dict={handle: train_iterator_handle, is_training: True})
            acc_value = sess.run(
                acc_value_op,
                feed_dict={handle: train_iterator_handle, is_training: True})
            print("step[%d],train batch loss = %g,train acc = %g." %
                  (g_step, loss_value, acc_value))
            if g_step % 500 == 500 - 1:
                val()  # validate the network
            if g_step == FLAGS.epoch * steps_per_epoch - 1:
                print("train finished")
                done = True
        except tf.errors.OutOfRangeError:
            print("OutOfRangeError")
            break
4)我的问题:
a。（此问题与使用移动平均进行训练时的评估有关）有没有一种方法可以让我
在验证期间（即在我的val()函数中）使用移动平均变量？
我知道可以先保存模型，再恢复模型的移动平均变量来进行验证，但我认为这会让我的val()占用更多的GPU内存，而我没有多余的显存，因此这不是我想要的方案。
b。（此问题与Dataset有关）有没有一种更结构化（优雅）的实现方式：
使用train_iterator.repeat(N)将训练数据集重复N个epoch、并每500步验证一次，
而不是依赖全局步数变量g_step
来决定何时验证、何时停止？