I'm using TF 2.1 in a conda environment and trying to build and train a model in float16 precision to reduce its size.
Training works fine on a single GPU, but it fails when I use tf.distribute.MirroredStrategy().
My code, in short, is:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Set the global dtype policy to float16
policy = mixed_precision.Policy('float16')
mixed_precision.set_policy(policy)

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Create and compile the model inside the strategy scope
    model = tf_models()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['categorical_accuracy'])

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=validation_data,
    callbacks=callback_list,
    verbose=2,
    epochs=num_epochs,
    batch_size=batch_size,
    shuffle=True
)
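For context, here is a minimal, self-contained sketch of the same configuration. The toy Dense model and random data are hypothetical stand-ins for tf_models(), X_train, and y_train from my real script; only the policy and strategy setup matches what I actually run.

import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Pure float16 policy (not 'mixed_float16'), same as in my script
policy = mixed_precision.Policy('float16')
mixed_precision.set_policy(policy)

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # Hypothetical stand-in for tf_models(): a small dense classifier
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['categorical_accuracy'])

# Random stand-in data for X_train / y_train
X_train = np.random.rand(256, 20).astype('float16')
y_train = tf.keras.utils.to_categorical(np.random.randint(0, 10, 256), 10)

model.fit(X_train, y_train, epochs=1, batch_size=32)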
The error output is shown below:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(op_type_name, name, **keywords)
411 preferred_dtype=default_dtype,
--> 412 as_ref=input_arg.is_ref)
413 if input_arg.number_attr and len(
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in internal_convert_n_to_tensor(values, dtype, name, as_ref, preferred_dtype, ctx)
1381 preferred_dtype=preferred_dtype,
-> 1382 ctx=ctx))
1383 return ret
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1289 "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
-> 1290 (dtype.name, value.dtype.name, value))
1291 return value
ValueError: Tensor conversion requested dtype float32 for Tensor with dtype float16: <tf.Tensor 'Adam/Reshape_2:0' shape=(3456,) dtype=float16>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-14-c23115b92e82> in <module>
39 epochs=num_epochs,
40 batch_size=batch_size,
---> 41 shuffle=True
42 )
43
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
340 mode=ModeKeys.TRAIN,
341 training_context=training_context,
--> 342 total_epochs=epochs)
343 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
344
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
566 xla_context.Exit()
567 else:
--> 568 result = self._call(*args, **kwds)
569
570 if tracing_count == self._get_tracing_count():
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
613 # This is the first call of __call__, so we have to initialize.
614 initializers = []
--> 615 self._initialize(args, kwds, add_initializers_to=initializers)
616 finally:
617 # At this point we know that the initialization is complete (or less
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
495 self._concrete_stateful_fn = (
496 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 497 *args, **kwds))
498
499 def invalid_creator_scope(*unused_args, **unused_kwds):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2387 args, kwargs = None, None
2388 with self._lock:
-> 2389 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2390 return graph_function
2391
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2701
2702 self._function_cache.missed.add(call_context_key)
-> 2703 graph_function = self._create_graph_function(args, kwargs)
2704 self._function_cache.primary[cache_key] = graph_function
2705 return graph_function, args, kwargs
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2591 arg_names=arg_names,
2592 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2593 capture_by_value=self._capture_by_value),
2594 self._function_attributes,
2595 # Tell the ConcreteFunction to clean up its graph once it goes out of
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
976 converted_func)
977
--> 978 func_outputs = python_func(*func_args, **func_kwargs)
979
980 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
437 # __wrapped__ allows AutoGraph to swap in a converted function. We give
438 # the function a weak reference to itself to avoid a reference cycle.
--> 439 return weak_wrapped_fn().__wrapped__(*args, **kwds)
440 weak_wrapped_fn = weakref.ref(wrapped_fn)
441
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in distributed_function(input_iterator)
83 args = _prepare_feed_values(model, input_iterator, mode, strategy)
84 outputs = strategy.experimental_run_v2(
---> 85 per_replica_function, args=args)
86 # Out of PerReplica outputs reduce or pick values to return.
87 all_outputs = dist_utils.unwrap_output_dict(
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in experimental_run_v2(self, fn, args, kwargs)
761 fn = autograph.tf_convert(fn, ag_ctx.control_status_ctx(),
762 convert_by_default=False)
--> 763 return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
764
765 def reduce(self, reduce_op, value, axis):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in call_for_each_replica(self, fn, args, kwargs)
1817 kwargs = {}
1818 with self._container_strategy().scope():
-> 1819 return self._call_for_each_replica(fn, args, kwargs)
1820
1821 def _call_for_each_replica(self, fn, args, kwargs):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(self, fn, args, kwargs)
692 self._container_strategy().__class__.__name__, 5)
693 return _call_for_each_replica(self._container_strategy(), self._device_map,
--> 694 fn, args, kwargs)
695
696 def _configure(self,
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(distribution, device_map, fn, args, kwargs)
199 for t in threads:
200 t.should_run.set()
--> 201 coord.join(threads)
202
203 return values.regroup(device_map, tuple(t.main_result for t in threads))
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
387 self._registered_threads = set()
388 if self._exc_info_to_raise:
--> 389 six.reraise(*self._exc_info_to_raise)
390 elif stragglers:
391 if ignore_live_threads:
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py in stop_on_exception(self)
295 """
296 try:
--> 297 yield
298 except: # pylint: disable=bare-except
299 self.request_stop(ex=sys.exc_info())
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(distribution, device_map, fn, args, kwargs)
193 variable_scope.variable_scope(mtt_captured_var_scope):
194 merge_result = threads[0].merge_fn(distribution, *merge_args,
--> 195 **merge_kwargs)
196 for r, t in enumerate(threads):
197 t.merge_result = values.select_replica(r, merge_result)
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in _distributed_apply(self, distribution, grads_and_vars, name, apply_state)
447 """`apply_gradients` using a `DistributionStrategy`."""
448 reduced_grads = distribution.extended.batch_reduce_to(
--> 449 ds_reduce_util.ReduceOp.SUM, grads_and_vars)
450 var_list = [v for _, v in grads_and_vars]
451 grads_and_vars = zip(reduced_grads, var_list)
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in batch_reduce_to(self, reduce_op, value_destination_pairs)
1492 if isinstance(reduce_op, six.string_types):
1493 reduce_op = reduce_util.ReduceOp(reduce_op.upper())
-> 1494 return self._batch_reduce_to(reduce_op, value_destination_pairs)
1495
1496 def _batch_reduce_to(self, reduce_op, value_destination_pairs):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _batch_reduce_to(self, reduce_op, value_destination_pairs)
738 def _batch_reduce_to(self, reduce_op, value_destination_pairs):
739 return self._get_cross_device_ops().batch_reduce(reduce_op,
--> 740 value_destination_pairs)
741
742 def _update(self, var, fn, args, kwargs, group):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in batch_reduce(self, reduce_op, value_destination_pairs)
325 ]
326
--> 327 return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
328
329 def broadcast(self, tensor, destinations):
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in batch_reduce_implementation(self, reduce_op, value_destination_pairs)
728 if _all_devices_match(value_destination_pairs):
729 return self._batch_all_reduce(reduce_op,
--> 730 [v[0] for v in value_destination_pairs])
731 else:
732 return [
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _batch_all_reduce(self, reduce_op, per_replica_values)
740 cross_device_utils.split_by_sparsity(per_replica_values))
741 if dense_values:
--> 742 dense_results = self._do_batch_all_reduce(reduce_op, dense_values)
743 else:
744 dense_results = []
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _do_batch_all_reduce(self, reduce_op, dense_values)
765 device_grad_packs, tensor_packer = _pack_tensors(
766 grouped, self._num_packs, self._agg_small_grads_max_bytes,
--> 767 self._agg_small_grads_max_group)
768
769 # The actual aggregation of the repacked gradients. Note that they are
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _pack_tensors(device_grads, num_packs, agg_small_grads_max_bytes, agg_small_grads_max_group)
666 if num_packs > 0:
667 tensor_packer = _ConcatAndSplitPacker(num_packs)
--> 668 device_grad_packs = tensor_packer.pack(device_grads)
669 elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
670 tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes,
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in pack(self, grouped_grads_and_vars)
550 device_sizes = [array_ops.size(g) for g, _ in device_grads_and_vars]
551 # Concat all the flat grads into a big flat tensor.
--> 552 concat_grads = array_ops.concat(flat_grads, 0)
553
554 # Split the big tensor into num_splits packs. In cases where the
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/util/dispatch.py in wrapper(*args, **kwargs)
178 """Call target, and fall back on dispatchers if there is a TypeError."""
179 try:
--> 180 return target(*args, **kwargs)
181 except (TypeError, ValueError):
182 # Note: convert_to_eager_tensor currently raises a ValueError, not a
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py in concat(values, axis, name)
1515 dtype=dtypes.int32).get_shape().assert_has_rank(0)
1516 return identity(values[0], name=name)
-> 1517 return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
1518
1519
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_array_ops.py in concat_v2(values, axis, name)
1124 _attr_N = len(values)
1125 _, _, _op, _outputs = _op_def_library._apply_op_helper(
-> 1126 "ConcatV2", values=values, axis=axis, name=name)
1127 _result = _outputs[:]
1128 if _execute.must_record_gradient():
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(op_type_name, name, **keywords)
438 (prefix, dtype.name))
439 else:
--> 440 raise TypeError("%s that don't all match." % prefix)
441 else:
442 raise TypeError(
TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [float32, float32, float16, float16, float32, float32, float16, float16, float32, float16, float16, float16, float16, float32, float32, float16, float16, float16, float16, float16, float16, float16, float16, float32, float32, float32, float32, float16, float16, float32, float16, float16, float16, float16, float32, float32, float16, float16, float16, float16, float16, float16, float16, float16, float32, float32, float32, float32, float16, float16, float32, float32, float16, float16, float32, float32] that don't all match.