(TF 2.1, error) Training a float16 model with MirroredStrategy

Time: 2020-04-13 02:37:00

Tags: tensorflow

I am using TF 2.1 in a conda environment and am trying to build and train a model in float16 precision to reduce the model size.

I have already trained the model successfully on a single GPU (a sketch of that single-GPU setup is included after the code below). However, it fails with tf.distribute.MirroredStrategy().

My code, in short, is as follows:

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# set the global dtype policy to float16
policy = mixed_precision.Policy('float16')
mixed_precision.set_policy(policy)

# create and compile the model inside the MirroredStrategy scope
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # create model
    model = tf_models()

    # compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(),
                  metrics=['categorical_accuracy'])

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=validation_data,
    callbacks=callback_list,
    verbose=2,
    epochs=num_epochs,
    batch_size=batch_size,
    shuffle=True
)
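
For reference, the single-GPU run that trains without problems looks roughly like the sketch below; it uses the same tf_models() builder and the same data variables, just without a distribution strategy (this is a minimal reconstruction, not the exact script):

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# same float16 policy, but no MirroredStrategy scope
policy = mixed_precision.Policy('float16')
mixed_precision.set_policy(policy)

model = tf_models()
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['categorical_accuracy'])
history = model.fit(x=X_train, y=y_train,
                    validation_data=validation_data,
                    epochs=num_epochs,
                    batch_size=batch_size)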

The error output is shown below:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(op_type_name, name, **keywords)
    411               preferred_dtype=default_dtype,
--> 412               as_ref=input_arg.is_ref)
    413           if input_arg.number_attr and len(

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in internal_convert_n_to_tensor(values, dtype, name, as_ref, preferred_dtype, ctx)
   1381             preferred_dtype=preferred_dtype,
-> 1382             ctx=ctx))
   1383   return ret

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1289           "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
-> 1290           (dtype.name, value.dtype.name, value))
   1291     return value

ValueError: Tensor conversion requested dtype float32 for Tensor with dtype float16: <tf.Tensor 'Adam/Reshape_2:0' shape=(3456,) dtype=float16>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-14-c23115b92e82> in <module>
     39     epochs=num_epochs,
     40     batch_size=batch_size,
---> 41     shuffle=True
     42 )
     43 

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817         max_queue_size=max_queue_size,
    818         workers=workers,
--> 819         use_multiprocessing=use_multiprocessing)
    820 
    821   def evaluate(self,

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    340                 mode=ModeKeys.TRAIN,
    341                 training_context=training_context,
--> 342                 total_epochs=epochs)
    343             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    344 

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    126         step=step, mode=mode, size=current_batch_size) as batch_logs:
    127       try:
--> 128         batch_outs = execution_function(iterator)
    129       except (StopIteration, errors.OutOfRangeError):
    130         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
     96     # `numpy` translates Tensors to values in Eager mode.
     97     return nest.map_structure(_non_none_constant_value,
---> 98                               distributed_function(input_fn))
     99 
    100   return execution_function

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
    566         xla_context.Exit()
    567     else:
--> 568       result = self._call(*args, **kwds)
    569 
    570     if tracing_count == self._get_tracing_count():

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
    613       # This is the first call of __call__, so we have to initialize.
    614       initializers = []
--> 615       self._initialize(args, kwds, add_initializers_to=initializers)
    616     finally:
    617       # At this point we know that the initialization is complete (or less

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    495     self._concrete_stateful_fn = (
    496         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 497             *args, **kwds))
    498 
    499     def invalid_creator_scope(*unused_args, **unused_kwds):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2387       args, kwargs = None, None
   2388     with self._lock:
-> 2389       graph_function, _, _ = self._maybe_define_function(args, kwargs)
   2390     return graph_function
   2391 

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   2701 
   2702       self._function_cache.missed.add(call_context_key)
-> 2703       graph_function = self._create_graph_function(args, kwargs)
   2704       self._function_cache.primary[cache_key] = graph_function
   2705       return graph_function, args, kwargs

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   2591             arg_names=arg_names,
   2592             override_flat_arg_shapes=override_flat_arg_shapes,
-> 2593             capture_by_value=self._capture_by_value),
   2594         self._function_attributes,
   2595         # Tell the ConcreteFunction to clean up its graph once it goes out of

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    976                                           converted_func)
    977 
--> 978       func_outputs = python_func(*func_args, **func_kwargs)
    979 
    980       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    437         # __wrapped__ allows AutoGraph to swap in a converted function. We give
    438         # the function a weak reference to itself to avoid a reference cycle.
--> 439         return weak_wrapped_fn().__wrapped__(*args, **kwds)
    440     weak_wrapped_fn = weakref.ref(wrapped_fn)
    441 

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in distributed_function(input_iterator)
     83     args = _prepare_feed_values(model, input_iterator, mode, strategy)
     84     outputs = strategy.experimental_run_v2(
---> 85         per_replica_function, args=args)
     86     # Out of PerReplica outputs reduce or pick values to return.
     87     all_outputs = dist_utils.unwrap_output_dict(

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in experimental_run_v2(self, fn, args, kwargs)
    761       fn = autograph.tf_convert(fn, ag_ctx.control_status_ctx(),
    762                                 convert_by_default=False)
--> 763       return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    764 
    765   def reduce(self, reduce_op, value, axis):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in call_for_each_replica(self, fn, args, kwargs)
   1817       kwargs = {}
   1818     with self._container_strategy().scope():
-> 1819       return self._call_for_each_replica(fn, args, kwargs)
   1820 
   1821   def _call_for_each_replica(self, fn, args, kwargs):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(self, fn, args, kwargs)
    692                           self._container_strategy().__class__.__name__, 5)
    693     return _call_for_each_replica(self._container_strategy(), self._device_map,
--> 694                                   fn, args, kwargs)
    695 
    696   def _configure(self,

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(distribution, device_map, fn, args, kwargs)
    199     for t in threads:
    200       t.should_run.set()
--> 201     coord.join(threads)
    202 
    203   return values.regroup(device_map, tuple(t.main_result for t in threads))

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
    387       self._registered_threads = set()
    388       if self._exc_info_to_raise:
--> 389         six.reraise(*self._exc_info_to_raise)
    390       elif stragglers:
    391         if ignore_live_threads:

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
    701             if value.__traceback__ is not tb:
    702                 raise value.with_traceback(tb)
--> 703             raise value
    704         finally:
    705             value = None

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py in stop_on_exception(self)
    295     """
    296     try:
--> 297       yield
    298     except:  # pylint: disable=bare-except
    299       self.request_stop(ex=sys.exc_info())

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _call_for_each_replica(distribution, device_map, fn, args, kwargs)
    193               variable_scope.variable_scope(mtt_captured_var_scope):
    194             merge_result = threads[0].merge_fn(distribution, *merge_args,
--> 195                                                **merge_kwargs)
    196           for r, t in enumerate(threads):
    197             t.merge_result = values.select_replica(r, merge_result)

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in _distributed_apply(self, distribution, grads_and_vars, name, apply_state)
    447     """`apply_gradients` using a `DistributionStrategy`."""
    448     reduced_grads = distribution.extended.batch_reduce_to(
--> 449         ds_reduce_util.ReduceOp.SUM, grads_and_vars)
    450     var_list = [v for _, v in grads_and_vars]
    451     grads_and_vars = zip(reduced_grads, var_list)

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py in batch_reduce_to(self, reduce_op, value_destination_pairs)
   1492     if isinstance(reduce_op, six.string_types):
   1493       reduce_op = reduce_util.ReduceOp(reduce_op.upper())
-> 1494     return self._batch_reduce_to(reduce_op, value_destination_pairs)
   1495 
   1496   def _batch_reduce_to(self, reduce_op, value_destination_pairs):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py in _batch_reduce_to(self, reduce_op, value_destination_pairs)
    738   def _batch_reduce_to(self, reduce_op, value_destination_pairs):
    739     return self._get_cross_device_ops().batch_reduce(reduce_op,
--> 740                                                      value_destination_pairs)
    741 
    742   def _update(self, var, fn, args, kwargs, group):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in batch_reduce(self, reduce_op, value_destination_pairs)
    325       ]
    326 
--> 327     return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
    328 
    329   def broadcast(self, tensor, destinations):

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in batch_reduce_implementation(self, reduce_op, value_destination_pairs)
    728     if _all_devices_match(value_destination_pairs):
    729       return self._batch_all_reduce(reduce_op,
--> 730                                     [v[0] for v in value_destination_pairs])
    731     else:
    732       return [

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _batch_all_reduce(self, reduce_op, per_replica_values)
    740         cross_device_utils.split_by_sparsity(per_replica_values))
    741     if dense_values:
--> 742       dense_results = self._do_batch_all_reduce(reduce_op, dense_values)
    743     else:
    744       dense_results = []

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _do_batch_all_reduce(self, reduce_op, dense_values)
    765     device_grad_packs, tensor_packer = _pack_tensors(
    766         grouped, self._num_packs, self._agg_small_grads_max_bytes,
--> 767         self._agg_small_grads_max_group)
    768 
    769     # The actual aggregation of the repacked gradients. Note that they are

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in _pack_tensors(device_grads, num_packs, agg_small_grads_max_bytes, agg_small_grads_max_group)
    666   if num_packs > 0:
    667     tensor_packer = _ConcatAndSplitPacker(num_packs)
--> 668     device_grad_packs = tensor_packer.pack(device_grads)
    669   elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
    670     tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes,

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/distribute/cross_device_ops.py in pack(self, grouped_grads_and_vars)
    550         device_sizes = [array_ops.size(g) for g, _ in device_grads_and_vars]
    551         # Concat all the flat grads into a big flat tensor.
--> 552         concat_grads = array_ops.concat(flat_grads, 0)
    553 
    554         # Split the big tensor into num_splits packs. In cases where the

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/util/dispatch.py in wrapper(*args, **kwargs)
    178     """Call target, and fall back on dispatchers if there is a TypeError."""
    179     try:
--> 180       return target(*args, **kwargs)
    181     except (TypeError, ValueError):
    182       # Note: convert_to_eager_tensor currently raises a ValueError, not a

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py in concat(values, axis, name)
   1515           dtype=dtypes.int32).get_shape().assert_has_rank(0)
   1516       return identity(values[0], name=name)
-> 1517   return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
   1518 
   1519 

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_array_ops.py in concat_v2(values, axis, name)
   1124   _attr_N = len(values)
   1125   _, _, _op, _outputs = _op_def_library._apply_op_helper(
-> 1126         "ConcatV2", values=values, axis=axis, name=name)
   1127   _result = _outputs[:]
   1128   if _execute.must_record_gradient():

~/anaconda3/envs/dcaseF16/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(op_type_name, name, **keywords)
    438                               (prefix, dtype.name))
    439             else:
--> 440               raise TypeError("%s that don't all match." % prefix)
    441           else:
    442             raise TypeError(

TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [float32, float32, float16, float16, float32, float32, float16, float16, float32, float16, float16, float16, float16, float32, float32, float16, float16, float16, float16, float16, float16, float16, float16, float32, float32, float32, float32, float16, float16, float32, float16, float16, float16, float16, float32, float32, float16, float16, float16, float16, float16, float16, float16, float16, float32, float32, float32, float32, float16, float16, float32, float32, float16, float16, float32, float32] that don't all match.
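
The ConcatV2 error above suggests that the gradients being packed for the cross-device reduce are a mix of float32 and float16 tensors. To check where the mixture comes from, one thing I can do is print the dtype of every variable created under the strategy scope (just a diagnostic sketch; tf_models() is the same model builder as above):

import tensorflow as tf

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = tf_models()
    # print the dtype of every model variable to see which ones are not float16
    for v in model.variables:
        print(v.name, v.dtype)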

0 Answers:

No answers yet.