协议消息 RunOptions 没有 "report_tensor_allocations_upon_oom" 字段

时间:2019-09-17 09:25:57

标签: python tensorflow deep-learning

我正在训练ctpn:GitHub link
使用tensorflow-gpu==1.3

  

例外:

     

Traceback (most recent call last):
  File "/home/xxx/text-detection-ctpn/ctpn/train_net.py", line 54, in <module>
    restore=True)
  File "/home/xxx/text-detection-ctpn/lib/fast_rcnn/train.py", line 245, in train_net
    sw.train_model(sess, max_iters, restore=restore)
  File "/home/xxx/text-detection-ctpn/lib/fast_rcnn/train.py", line 178, in train_model
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
ValueError: Protocol message RunOptions has no "report_tensor_allocations_upon_oom" field.

这是代码:

def train_model(self, sess, max_iters, restore=False):
    """Network training loop.

    Builds the loss, summaries and optimizer on GPU 0, optionally loads
    pretrained VGG16 weights or restores the latest checkpoint, then runs
    the training loop for ``max_iters`` iterations.

    Args:
        sess: an open ``tf.Session`` to run the graph in.
        max_iters: total number of training iterations.
        restore: if True, resume from the latest checkpoint instead of
            loading the pretrained model.

    Raises:
        RuntimeError: if loading the pretrained model fails.
    """
    with tf.device('/device:GPU:0'):
        data_layer = get_data_layer(self.roidb, self.imdb.num_classes)
        total_loss, model_loss, rpn_cross_entropy, rpn_loss_box = self.net.build_loss(ohem=cfg.TRAIN.OHEM)
        # scalar summaries for TensorBoard
        tf.summary.scalar('rpn_reg_loss', rpn_loss_box)
        tf.summary.scalar('rpn_cls_loss', rpn_cross_entropy)
        tf.summary.scalar('model_loss', model_loss)
        tf.summary.scalar('total_loss', total_loss)
        summary_op = tf.summary.merge_all()

        # optimizer selection driven by config; Momentum (the default branch)
        # is the only solver that honours the decaying `lr` variable
        lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False)
        if cfg.TRAIN.SOLVER == 'Adam':
            opt = tf.train.AdamOptimizer(cfg.TRAIN.LEARNING_RATE)
        elif cfg.TRAIN.SOLVER == 'RMS':
            opt = tf.train.RMSPropOptimizer(cfg.TRAIN.LEARNING_RATE)
        else:
            opt = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

        global_step = tf.Variable(0, trainable=False)
        with_clip = True
        if with_clip:
            # clip gradients by global norm (10.0) to stabilize training
            tvars = tf.trainable_variables()
            grads, norm = tf.clip_by_global_norm(tf.gradients(total_loss, tvars), 10.0)
            train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
        else:
            train_op = opt.minimize(total_loss, global_step=global_step)

        # initialize variables
        sess.run(tf.global_variables_initializer())
        restore_iter = 0

        # load pretrained VGG16 weights unless resuming from a checkpoint
        if self.pretrained_model is not None and not restore:
            try:
                print(('Loading pretrained model '
                       'weights from {:s}').format(self.pretrained_model))
                self.net.load(self.pretrained_model, sess, True)
            except Exception as exc:
                # BUG FIX: the original did `raise 'Check your ...'`, which is
                # itself a TypeError in Python 3 (cannot raise a str). Raise a
                # real exception chained to the underlying cause instead.
                raise RuntimeError(
                    'Check your pretrained model {:s}'.format(self.pretrained_model)) from exc

        if restore:
            # renamed from `dir`, which shadowed the builtin
            checkpoint_dir = '/home/xxx/deploy/dev/checkpoints'
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ')
            self.saver.restore(sess, ckpt.model_checkpoint_path)
            # NOTE(review): restore_iter stays 0 here (parsing the iteration
            # number out of the checkpoint name was commented out upstream),
            # so resumed runs restart the step counter from 0.
            sess.run(global_step.assign(restore_iter))
            print('done')

        # BUG FIX for "Protocol message RunOptions has no
        # 'report_tensor_allocations_upon_oom' field": that proto field only
        # exists in TF >= 1.8; on tensorflow 1.3 passing it raises ValueError.
        # Fall back to plain RunOptions on older releases, and build the
        # (loop-invariant) options once instead of every iteration.
        try:
            run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
        except ValueError:
            run_options = tf.RunOptions()

        last_snapshot_iter = -1
        timer = Timer()
        for iter in range(restore_iter, max_iters):
            timer.tic()
            # decay the learning rate by GAMMA every STEPSIZE iterations
            if iter != 0 and iter % cfg.TRAIN.STEPSIZE == 0:
                sess.run(tf.assign(lr, lr.eval() * cfg.TRAIN.GAMMA))
                print(lr)

            # get one batch
            blobs = data_layer.forward()
            feed_dict = {
                self.net.data: blobs['data'],
                self.net.im_info: blobs['im_info'],
                self.net.keep_prob: 0.5,
                self.net.gt_boxes: blobs['gt_boxes'],
                self.net.gt_ishard: blobs['gt_ishard'],
                self.net.dontcare_areas: blobs['dontcare_areas']
            }
            res_fetches = []
            fetch_list = [total_loss, model_loss, rpn_cross_entropy, rpn_loss_box,
                          summary_op,
                          train_op] + res_fetches

            total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, \
                summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict, options=run_options)

            self.writer.add_summary(summary=summary_str, global_step=global_step.eval())

            _diff_time = timer.toc(average=False)

            if iter % cfg.TRAIN.DISPLAY == 0:
                print('iter: %d / %d, total loss: %.4f, model loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, lr: %f' %
                      (iter, max_iters, total_loss_val, model_loss_val, rpn_loss_cls_val, rpn_loss_box_val, lr.eval()))
                print('speed: {:.3f}s / iter'.format(_diff_time))

            if (iter + 1) % cfg.TRAIN.SNAPSHOT_ITERS == 0:
                last_snapshot_iter = iter
                self.snapshot(sess, iter)

如何解决?

0 个答案:

没有答案