Hello TensorFlow developers,
I am trying to add exponential-moving-average support to my optimization step. However, the new Estimator API backed by MirroredStrategy fails because of the tensor-conversion method that is specific to this strategy.
When I call ema.apply_gradients(...), it ends with the following exception:
INFO:tensorflow:Using config: {'_model_dir': './output', ...}
365 saving_listeners = _check_listeners_type(saving_listeners)
--> 366 loss = self._train_model(input_fn, hooks, saving_listeners)
367 logging.info('Loss for final step: %s.', loss)
368 return self
/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1115 def _train_model(self, input_fn, hooks, saving_listeners):
1116 if self._distribution:
-> 1117 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1118 else:
1119 return self._train_model_default(input_fn, hooks, saving_listeners)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
1158 labels, # although this will be None it seems
1159 model_fn_lib.ModeKeys.TRAIN,
-> 1160 self.config)
1161
1162 # TODO(anjalisridhar): Figure out how to resolve the following scaffold
/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py in call_for_each_tower(self, fn, *args, **kwargs)
792 """
793 _require_cross_tower_context(self)
--> 794 return self._call_for_each_tower(fn, *args, **kwargs)
795
796 def _call_for_each_tower(self, fn, *args, **kwargs):
/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py in _call_for_each_tower(self, fn, *args, **kwargs)
267 for t in threads:
268 t.should_run.set()
--> 269 coord.join(threads)
270
271 return values.regroup({t.device: t.main_result for t in threads})
/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
387 self._registered_threads = set()
388 if self._exc_info_to_raise:
--> 389 six.reraise(*self._exc_info_to_raise)
390 elif stragglers:
391 if ignore_live_threads:
/usr/local/lib/python3.5/dist-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py in stop_on_exception(self)
295 """
296 try:
--> 297 yield
298 except: # pylint: disable=bare-except
299 self.request_stop(ex=sys.exc_info())
/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py in run(self)
477 self._captured_var_scope, reuse=self.tower_id > 0), \
478 variable_scope.variable_creator_scope(self.variable_creator_fn):
--> 479 self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
480 self.done = True
481 finally:
/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py in _call_model_fn(self, features, labels, mode, config)
1105
1106 logging.info('Calling model_fn.')
-> 1107 model_fn_results = self._model_fn(features=features, **kwargs)
1108 logging.info('Done calling model_fn.')
1109
<ipython-input-9-2239e101f763> in model_fn(features, labels, mode)
3 loss = tfsi_model(features)
4 if mode == tf.estimator.ModeKeys.TRAIN:
----> 5 train_op, grads, saver = minimize(loss)
6 writer, merged = prepare_summary(tf.get_default_graph(), loss, grads)
7 chkpt_hook = tf.train.CheckpointSaverHook(
<ipython-input-7-8dbd2a0df6d6> in minimize(loss)
17 train_op = ema.apply_gradients(
18 grads,
---> 19 global_step=tf.train.get_or_create_global_step()
20 )
21 return train_op, grads, ema.swapping_saver()
/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/opt/python/training/moving_average_optimizer.py in apply_gradients(self, grads_and_vars, global_step, name)
97 if self._sequential_update:
98 with ops.control_dependencies([train_op]):
---> 99 ma_op = self._ema.apply(var_list)
100 else:
101 ma_op = self._ema.apply(var_list)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/moving_averages.py in apply(self, var_list)
428 zero_debias = self._averages[var] in zero_debias_true
429 updates.append(assign_moving_average(
--> 430 self._averages[var], var, decay, zero_debias=zero_debias))
431 return control_flow_ops.group(*updates, name=scope)
432
/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/moving_averages.py in assign_moving_average(variable, value, decay, zero_debias, name)
82 with ops.name_scope(name, "AssignMovingAvg",
83 [variable, value, decay]) as scope:
---> 84 with ops.colocate_with(variable):
85 decay = ops.convert_to_tensor(1.0 - decay, name="decay")
86 if decay.dtype != variable.dtype.base_dtype:
/usr/lib/python3.5/contextlib.py in __enter__(self)
57 def __enter__(self):
58 try:
---> 59 return next(self.gen)
60 except StopIteration:
61 raise RuntimeError("generator didn't yield") from None
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in _colocate_with_for_gradient(self, op, gradient_uid, ignore_existing)
4217 def _colocate_with_for_gradient(self, op, gradient_uid,
4218 ignore_existing=False):
-> 4219 with self.colocate_with(op, ignore_existing):
4220 if gradient_uid is not None and self._control_flow_context is not None:
4221 try:
/usr/lib/python3.5/contextlib.py in __enter__(self)
57 def __enter__(self):
58 try:
---> 59 return next(self.gen)
60 except StopIteration:
61 raise RuntimeError("generator didn't yield") from None
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in colocate_with(self, op, ignore_existing)
4270 if op is not None and not isinstance(op, Operation):
4271 # We always want to colocate with the reference op.
-> 4272 op = internal_convert_to_tensor_or_indexed_slices(op, as_ref=True).op
4273
4274 # By default, colocate_with resets the device function stack,
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor_or_indexed_slices(value, dtype, name, as_ref)
1266 else:
1267 return internal_convert_to_tensor(
-> 1268 value, dtype=dtype, name=name, as_ref=as_ref)
1269
1270
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, ctx)
1105
1106 if ret is None:
-> 1107 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
1108
1109 if ret is NotImplemented:
/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/values.py in _tensor_conversion(var, dtype, name, as_ref)
243 # Try to avoid assignments to and other mutations of MirroredVariable
244 # state except through a DistributionStrategy.update() call.
--> 245 assert not as_ref
246 return ops.internal_convert_to_tensor(
247 var.get(), dtype=dtype, name=name, as_ref=as_ref)
AssertionError:
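For context, this is roughly how the Estimator and model_fn are wired up. It is a trimmed sketch: tfsi_model, prepare_summary and train_input_fn are defined elsewhere in my notebook, and the CheckpointSaverHook arguments shown here are placeholders, not the exact values I use:

    import tensorflow as tf

    def model_fn(features, labels, mode):
        loss = tfsi_model(features)                  # the network itself, defined elsewhere
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op, grads, saver = minimize(loss)  # see minimize() below
            writer, merged = prepare_summary(tf.get_default_graph(), loss, grads)
            chkpt_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir='./output', save_steps=1000, saver=saver)
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op, training_hooks=[chkpt_hook])

    config = tf.estimator.RunConfig(
        model_dir='./output',
        train_distribute=tf.contrib.distribute.MirroredStrategy())
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
    estimator.train(input_fn=train_input_fn)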
Here is the code that creates the optimizer and applies back-propagation to the given loss:
    def minimize(loss):
        # Piecewise-constant learning-rate schedule driven by the global step.
        lr = tf.constant(learning_rate_schedule[0], dtype=tf.float32)
        for key, val in learning_rate_schedule.items():
            lr = tf.cond(
                tf.less(tf.train.get_or_create_global_step(), key),
                lambda: lr,
                lambda: tf.constant(val, dtype=tf.float32)
            )
        opt = tf.train.AdamOptimizer(learning_rate=lr, epsilon=FLAGS.epsilon)
        if FLAGS.is_ema_enabled:
            # Wrap the optimizer so that moving averages of the variables are
            # maintained after each update.
            ema = tf.contrib.opt.MovingAverageOptimizer(
                opt,
                num_updates=tf.train.get_or_create_global_step()
            )
            grads = ema.compute_gradients(loss)
            train_op = ema.apply_gradients(
                grads,
                global_step=tf.train.get_or_create_global_step()
            )
            return train_op, grads, ema.swapping_saver()
        else:
            grads = opt.compute_gradients(loss)
            train_op = opt.apply_gradients(
                grads,
                global_step=tf.train.get_or_create_global_step()
            )
            return train_op, grads, tf.train.Saver()
The assertion seems to fire when internal_convert_to_tensor_or_indexed_slices receives the mirrored variable as a reference (as_ref=True), although I am not certain this is the root cause. How should a moving average be applied to the optimization step when high-level APIs backed by a distribution strategy are involved?
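If it helps, I believe the failure reduces to the colocation step inside assign_moving_average once a mirrored variable is involved. The following is an untested sketch of what I imagine a minimal reproduction outside the Estimator would look like (the device list and decay value are placeholders):

    import tensorflow as tf

    strategy = tf.contrib.distribute.MirroredStrategy(["/gpu:0", "/gpu:1"])

    def tower_fn():
        # Under the strategy this becomes a MirroredVariable.
        v = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
        # ExponentialMovingAverage.apply() calls assign_moving_average(), which
        # enters ops.colocate_with(v); converting the mirrored variable with
        # as_ref=True should hit the same `assert not as_ref` as above.
        return tf.train.ExponentialMovingAverage(decay=0.99).apply([v])

    with strategy.scope():
        strategy.call_for_each_tower(tower_fn)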
Thanks for your help.