I am training CTPN (GitHub link) with tensorflow-gpu==1.3.
Exception:

Traceback (most recent call last):
  File "/home/xxx/text-detection-ctpn/ctpn/train_net.py", line 54
    restore=True)
  File "/home/xxx/text-detection-ctpn/lib/fast_rcnn/train.py", line 245, in train_net
    sw.train_model(sess, max_iters, restore=restore)
  File "/home/xxx/text-detection-ctpn/lib/fast_rcnn/train.py", line 178, in train_model
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
ValueError: Protocol message RunOptions has no "report_tensor_allocations_upon_oom" field.
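For reference, a quick way to confirm which fields the RunOptions protobuf of the installed build actually defines (just a diagnostic sketch using the standard protobuf descriptor API, run in the same environment):

import tensorflow as tf

print(tf.__version__)  # reports 1.3.x here
# List the fields compiled into the RunOptions message; on this build the
# name 'report_tensor_allocations_upon_oom' is expected to be missing.
print(sorted(tf.RunOptions.DESCRIPTOR.fields_by_name.keys()))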
Here is the code:
def train_model(self, sess, max_iters, restore=False):
    """Network training loop."""
    with tf.device('/device:GPU:0'):
        data_layer = get_data_layer(self.roidb, self.imdb.num_classes)
        total_loss, model_loss, rpn_cross_entropy, rpn_loss_box = self.net.build_loss(ohem=cfg.TRAIN.OHEM)

        # scalar summary
        tf.summary.scalar('rpn_reg_loss', rpn_loss_box)
        tf.summary.scalar('rpn_cls_loss', rpn_cross_entropy)
        tf.summary.scalar('model_loss', model_loss)
        tf.summary.scalar('total_loss', total_loss)
        summary_op = tf.summary.merge_all()
        # log_image, log_image_data, log_image_name = \
        #     self.build_image_summary()

        # optimizer
        lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False)
        if cfg.TRAIN.SOLVER == 'Adam':
            opt = tf.train.AdamOptimizer(cfg.TRAIN.LEARNING_RATE)
        elif cfg.TRAIN.SOLVER == 'RMS':
            opt = tf.train.RMSPropOptimizer(cfg.TRAIN.LEARNING_RATE)
        else:
            # lr = tf.Variable(0.0, trainable=False)
            momentum = cfg.TRAIN.MOMENTUM
            opt = tf.train.MomentumOptimizer(lr, momentum)

        global_step = tf.Variable(0, trainable=False)
        with_clip = True
        if with_clip:
            tvars = tf.trainable_variables()
            grads, norm = tf.clip_by_global_norm(tf.gradients(total_loss, tvars), 10.0)
            train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
        else:
            train_op = opt.minimize(total_loss, global_step=global_step)
        # initialize variables
        sess.run(tf.global_variables_initializer())
        restore_iter = 0

        # load vgg16
        if self.pretrained_model is not None and not restore:
            try:
                print(('Loading pretrained model '
                       'weights from {:s}').format(self.pretrained_model))
                self.net.load(self.pretrained_model, sess, True)
            except:
                raise Exception('Check your pretrained model {:s}'.format(self.pretrained_model))

        if restore:
            dir = '/home/xxx/deploy/dev/checkpoints'
            ckpt = tf.train.get_checkpoint_state(dir)
            print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ')
            self.saver.restore(sess, ckpt.model_checkpoint_path)
            stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0]
            # restore_iter = int(stem.split('_')[-1])
            sess.run(global_step.assign(restore_iter))
            print('done')
        last_snapshot_iter = -1
        timer = Timer()
        #print("kkkk")
        for iter in range(restore_iter, max_iters):
            timer.tic()

            # learning rate
            if iter != 0 and iter % cfg.TRAIN.STEPSIZE == 0:
                sess.run(tf.assign(lr, lr.eval() * cfg.TRAIN.GAMMA))
                print(lr)

            # get one batch
            blobs = data_layer.forward()  ###########################
            #print("qqqq")

            feed_dict = {
                self.net.data: blobs['data'],
                self.net.im_info: blobs['im_info'],
                self.net.keep_prob: 0.5,
                self.net.gt_boxes: blobs['gt_boxes'],
                self.net.gt_ishard: blobs['gt_ishard'],
                self.net.dontcare_areas: blobs['dontcare_areas']
            }
            res_fetches = []
            fetch_list = [total_loss, model_loss, rpn_cross_entropy, rpn_loss_box,
                          summary_op,
                          train_op] + res_fetches
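            # NOTE: the next statement is train.py line 178 from the traceback above;
            # the RunOptions message shipped with tensorflow-gpu==1.3 does not define
            # 'report_tensor_allocations_upon_oom', which is what raises the ValueError.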
            run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

            total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, \
                summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict, options=run_options)

            self.writer.add_summary(summary=summary_str, global_step=global_step.eval())
            _diff_time = timer.toc(average=False)

            if (iter) % (cfg.TRAIN.DISPLAY) == 0:
                print('iter: %d / %d, total loss: %.4f, model loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, lr: %f' %
                      (iter, max_iters, total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, lr.eval()))
                print('speed: {:.3f}s / iter'.format(_diff_time))

            if (iter + 1) % cfg.TRAIN.SNAPSHOT_ITERS == 0:
                last_snapshot_iter = iter
                self.snapshot(sess, iter)
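For context, a guard along these lines (in place of the run_options assignment and the sess.run call inside the loop) would skip the option when the installed RunOptions protobuf lacks the field; this is only a sketch, assuming it is acceptable to fall back to the default run options:

# Request the per-tensor OOM report only when the installed RunOptions
# message actually defines the field; otherwise run without explicit
# options (sess.run accepts options=None).
if 'report_tensor_allocations_upon_oom' in tf.RunOptions.DESCRIPTOR.fields_by_name:
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
else:
    run_options = None
total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, \
    summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict, options=run_options)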
How can I fix this?