TensorFlow moving average optimizer and distribution strategy

Time: 2018-08-06 09:32:22

Tags: python-3.x tensorflow moving-average tensorflow-estimator

Hello TensorFlow developers,

I am trying to add exponential moving average support to the optimization step. However, the new Estimator API backed by MirroredStrategy fails because of the tensor conversion method that is specific to this strategy.
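
For context, the Estimator is wired to the strategy roughly like this (a simplified sketch of my setup; the MirroredStrategy arguments and input_fn are approximate placeholders, model_fn is the function shown in the traceback below):

import tensorflow as tf

# Simplified sketch: distribute training over the local GPUs and point the
# Estimator at the './output' model directory seen in the log below.
distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=2)
run_config = tf.estimator.RunConfig(
    model_dir='./output',
    train_distribute=distribution
)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
estimator.train(input_fn=input_fn)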

When I call ema.apply_gradients(...), it eventually fails with the following exception:

INFO:tensorflow:Using config: {'_model_dir': './output', ...}

    365       saving_listeners = _check_listeners_type(saving_listeners)
--> 366       loss = self._train_model(input_fn, hooks, saving_listeners)
    367       logging.info('Loss for final step: %s.', loss)
    368       return self

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
   1115   def _train_model(self, input_fn, hooks, saving_listeners):
   1116     if self._distribution:
-> 1117       return self._train_model_distributed(input_fn, hooks, saving_listeners)
   1118     else:
   1119       return self._train_model_default(input_fn, hooks, saving_listeners)

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
   1158             labels,  # although this will be None it seems
   1159             model_fn_lib.ModeKeys.TRAIN,
-> 1160             self.config)
   1161 
   1162         # TODO(anjalisridhar): Figure out how to resolve the following scaffold

/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py in call_for_each_tower(self, fn, *args, **kwargs)
    792     """
    793     _require_cross_tower_context(self)
--> 794     return self._call_for_each_tower(fn, *args, **kwargs)
    795 
    796   def _call_for_each_tower(self, fn, *args, **kwargs):

/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py in _call_for_each_tower(self, fn, *args, **kwargs)
    267       for t in threads:
    268         t.should_run.set()
--> 269       coord.join(threads)
    270 
    271     return values.regroup({t.device: t.main_result for t in threads})

/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
    387       self._registered_threads = set()
    388       if self._exc_info_to_raise:
--> 389         six.reraise(*self._exc_info_to_raise)
    390       elif stragglers:
    391         if ignore_live_threads:

/usr/local/lib/python3.5/dist-packages/six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py in stop_on_exception(self)
    295     """
    296     try:
--> 297       yield
    298     except:  # pylint: disable=bare-except
    299       self.request_stop(ex=sys.exc_info())

/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py in run(self)
    477                 self._captured_var_scope, reuse=self.tower_id > 0), \
    478             variable_scope.variable_creator_scope(self.variable_creator_fn):
--> 479           self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
    480           self.done = True
    481       finally:

/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _call_model_fn(self, features, labels, mode, config)
   1105 
   1106     logging.info('Calling model_fn.')
-> 1107     model_fn_results = self._model_fn(features=features, **kwargs)
   1108     logging.info('Done calling model_fn.')
   1109 

<ipython-input-9-2239e101f763> in model_fn(features, labels, mode)
      3     loss = tfsi_model(features)
      4     if mode == tf.estimator.ModeKeys.TRAIN:
----> 5         train_op, grads, saver = minimize(loss)
      6         writer, merged = prepare_summary(tf.get_default_graph(), loss, grads)
      7         chkpt_hook = tf.train.CheckpointSaverHook(

<ipython-input-7-8dbd2a0df6d6> in minimize(loss)
     17         train_op = ema.apply_gradients(
     18             grads,
---> 19             global_step=tf.train.get_or_create_global_step()
     20         )
     21         return train_op, grads, ema.swapping_saver()

/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/opt/python/training/moving_average_optimizer.py in apply_gradients(self, grads_and_vars, global_step, name)
     97     if self._sequential_update:
     98       with ops.control_dependencies([train_op]):
---> 99         ma_op = self._ema.apply(var_list)
    100     else:
    101       ma_op = self._ema.apply(var_list)

/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/moving_averages.py in apply(self, var_list)
    428         zero_debias = self._averages[var] in zero_debias_true
    429         updates.append(assign_moving_average(
--> 430             self._averages[var], var, decay, zero_debias=zero_debias))
    431       return control_flow_ops.group(*updates, name=scope)
    432 

/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/moving_averages.py in assign_moving_average(variable, value, decay, zero_debias, name)
     82   with ops.name_scope(name, "AssignMovingAvg",
     83                       [variable, value, decay]) as scope:
---> 84     with ops.colocate_with(variable):
     85       decay = ops.convert_to_tensor(1.0 - decay, name="decay")
     86       if decay.dtype != variable.dtype.base_dtype:

/usr/lib/python3.5/contextlib.py in __enter__(self)
     57     def __enter__(self):
     58         try:
---> 59             return next(self.gen)
     60         except StopIteration:
     61             raise RuntimeError("generator didn't yield") from None

/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in _colocate_with_for_gradient(self, op, gradient_uid, ignore_existing)
   4217   def _colocate_with_for_gradient(self, op, gradient_uid,
   4218                                   ignore_existing=False):
-> 4219     with self.colocate_with(op, ignore_existing):
   4220       if gradient_uid is not None and self._control_flow_context is not None:
   4221         try:

/usr/lib/python3.5/contextlib.py in __enter__(self)
     57     def __enter__(self):
     58         try:
---> 59             return next(self.gen)
     60         except StopIteration:
     61             raise RuntimeError("generator didn't yield") from None

/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in colocate_with(self, op, ignore_existing)
   4270     if op is not None and not isinstance(op, Operation):
   4271       # We always want to colocate with the reference op.
-> 4272       op = internal_convert_to_tensor_or_indexed_slices(op, as_ref=True).op
   4273 
   4274     # By default, colocate_with resets the device function stack,

/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor_or_indexed_slices(value, dtype, name, as_ref)
   1266   else:
   1267     return internal_convert_to_tensor(
-> 1268         value, dtype=dtype, name=name, as_ref=as_ref)
   1269 
   1270 

/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, ctx)
   1105 
   1106     if ret is None:
-> 1107       ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
   1108 
   1109     if ret is NotImplemented:

/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/values.py in _tensor_conversion(var, dtype, name, as_ref)
    243   # Try to avoid assignments to and other mutations of MirroredVariable
    244   # state except through a DistributionStrategy.update() call.
--> 245   assert not as_ref
    246   return ops.internal_convert_to_tensor(
    247       var.get(), dtype=dtype, name=name, as_ref=as_ref)

AssertionError: 

Here is the code that creates the optimizer and applies backpropagation to the given loss:

import tensorflow as tf

# learning_rate_schedule (dict: global step -> learning rate) and FLAGS are
# defined in earlier cells of the notebook.

def minimize(loss):
    # Piecewise-constant learning rate: keep the current rate until the global
    # step reaches the next boundary, then switch to the scheduled value.
    lr = tf.constant(learning_rate_schedule[0], dtype=tf.float32)
    for key, val in sorted(learning_rate_schedule.items()):
        lr = tf.cond(
            tf.less(tf.train.get_or_create_global_step(), key),
            lambda: lr,
            lambda: tf.constant(val, dtype=tf.float32)
        )
    opt = tf.train.AdamOptimizer(learning_rate=lr, epsilon=FLAGS.epsilon)
    if FLAGS.is_ema_enabled:
        # Wrap the optimizer so that shadow (moving-average) copies of the
        # variables are updated after every optimization step.
        ema = tf.contrib.opt.MovingAverageOptimizer(
            opt,
            num_updates=tf.train.get_or_create_global_step()
        )
        grads = ema.compute_gradients(loss)
        train_op = ema.apply_gradients(
            grads,
            global_step=tf.train.get_or_create_global_step()
        )
        # swapping_saver() checkpoints the moving averages in place of the raw weights.
        return train_op, grads, ema.swapping_saver()
    else:
        grads = opt.compute_gradients(loss)
        train_op = opt.apply_gradients(
            grads,
            global_step=tf.train.get_or_create_global_step()
        )
        return train_op, grads, tf.train.Saver()
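
For reference, the EMA-enabled branch above is roughly equivalent to the plain ExponentialMovingAverage pattern below (a sketch, not my actual code); per the traceback, the ema.apply() step inside MovingAverageOptimizer is exactly where the conversion assertion fires under MirroredStrategy:

# Sketch of what MovingAverageOptimizer does internally after each step
# (see the moving_average_optimizer.py frames in the traceback above).
opt = tf.train.AdamOptimizer(learning_rate=1e-3)
global_step = tf.train.get_or_create_global_step()
step_op = opt.minimize(loss, global_step=global_step)
ema = tf.train.ExponentialMovingAverage(decay=0.9999, num_updates=global_step)
with tf.control_dependencies([step_op]):
    # ema.apply() calls assign_moving_average(), which enters
    # ops.colocate_with(shadow_variable); converting a MirroredVariable there
    # with as_ref=True is what raises the AssertionError shown above.
    train_op = ema.apply(tf.trainable_variables())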

Some details about my environment:

  1. My machine has two NVIDIA GPUs (a Tesla K40 and a Quadro K620) and runs inside a Docker container on Ubuntu 16.04.
  2. My TensorFlow version is 1.9.0 (GPU), built from source.

The trouble seems to arise when internal_convert_to_tensor receives a reference variable (as_ref=True), although I am not sure. How should a moving average be applied to the optimization step when the high-level APIs with distribution-strategy support are involved?
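
To isolate the problem from the Estimator, I expect a reduction along these lines to hit the same assertion, although I have not verified it end to end (num_gpus and the dummy variable are placeholders):

import tensorflow as tf

def tower_fn():
    # Variables created inside the per-tower call become MirroredVariables.
    v = tf.get_variable('w', shape=[], initializer=tf.zeros_initializer())
    ema = tf.train.ExponentialMovingAverage(decay=0.999)
    # Applying the EMA to a MirroredVariable goes through assign_moving_average
    # and ops.colocate_with, the same path that fails in the traceback above.
    return ema.apply([v])

strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=2)
with strategy.scope():
    strategy.call_for_each_tower(tower_fn)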

Thank you for your help.

0 Answers

There are no answers yet.