My code works fine on a GPU, but on a TPU it fails at
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
with: AttributeError: Tensor.name is meaningless when eager execution is enabled.
I have a custom model whose train_step is not much different from the Keras default:
import tensorflow as tf

class CustomModel(tf.keras.Model):
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x = data
        y = tf.Variable(tf.constant([1.0], dtype=tf.float32))
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
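For context, this is roughly how I compile and fit the model; the exact optimizer, loss, and dataset are in the linked Colab, so treat the names below as placeholders (the fit() arguments mirror the traceback):

# Sketch of the surrounding training code, not the notebook's exact cell.
model = CustomModel()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
              loss=tf.keras.losses.MeanSquaredError())
model.fit(train_dataset,
          epochs=3,
          validation_data=val_dataset,
          validation_steps=val_steps,
          validation_freq=1,
          callbacks=callbacks)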
Here is the full error message:
Epoch 1/3
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-19-00fb5a641066> in <module>()
5 validation_steps=val_steps,
6 validation_freq=1,
----> 7 callbacks=callbacks)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
64 def _method_wrapper(self, *args, **kwargs):
65 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 66 return method(self, *args, **kwargs)
67
68 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
846 batch_size=batch_size):
847 callbacks.on_train_batch_begin(step)
--> 848 tmp_logs = train_function(iterator)
849 # Catch OutOfRangeError for Datasets of unknown size.
850 # This blocks until the batch has finished executing.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
578 xla_context.Exit()
579 else:
--> 580 result = self._call(*args, **kwds)
581
582 if tracing_count == self._get_tracing_count():
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
625 # This is the first call of __call__, so we have to initialize.
626 initializers = []
--> 627 self._initialize(args, kwds, add_initializers_to=initializers)
628 finally:
629 # At this point we know that the initialization is complete (or less
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
504 self._concrete_stateful_fn = (
505 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 506 *args, **kwds))
507
508 def invalid_creator_scope(*unused_args, **unused_kwds):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2444 args, kwargs = None, None
2445 with self._lock:
-> 2446 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2447 return graph_function
2448
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2775
2776 self._function_cache.missed.add(call_context_key)
-> 2777 graph_function = self._create_graph_function(args, kwargs)
2778 self._function_cache.primary[cache_key] = graph_function
2779 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2665 arg_names=arg_names,
2666 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2667 capture_by_value=self._capture_by_value),
2668 self._function_attributes,
2669 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
439 # __wrapped__ allows AutoGraph to swap in a converted function. We give
440 # the function a weak reference to itself to avoid a reference cycle.
--> 441 return weak_wrapped_fn().__wrapped__(*args, **kwds)
442 weak_wrapped_fn = weakref.ref(wrapped_fn)
443
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
AttributeError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function *
outputs = self.distribute_strategy.run(
<ipython-input-6-490916a676f3>:18 train_step *
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
/usr/local/lib/python3.6/dist-packages/tensorflow_addons/optimizers/weight_decay_optimizers.py:149 apply_gradients *
return super().apply_gradients(grads_and_vars, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:472 apply_gradients **
grads_and_vars = _filter_grads(grads_and_vars)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 _filter_grads
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 <listcomp>
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1123 name
"Tensor.name is meaningless when eager execution is enabled.")
AttributeError: Tensor.name is meaningless when eager execution is enabled.
The full code is available here:
https://colab.research.google.com/drive/1PqAAa0-Dh9cZfLjLQGuqt5zPWBXqZTn6?usp=sharing
I wonder whether I am missing some piece of TPU training setup, since the error only appears when training on a TPU.
Here are some possibly related GitHub issues:
https://github.com/tensorflow/tensorflow/issues/33045
https://github.com/tensorflow/tensorflow/issues/34635
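For reference, the TPU is initialized with the standard Colab pattern and the model is built and compiled inside the strategy scope. This is a sketch of what the notebook does, not the exact cell:

import os

# Standard Colab TPU setup for this TF 2.2-era API; sketch only, with
# placeholder optimizer and loss.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)

with strategy.scope():
    model = CustomModel()
    model.compile(optimizer=optimizer, loss=loss_fn)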
Edit:
I noticed that TensorFlow changed their definition of train_step
(https://github.com/tensorflow/tensorflow/blob/2434d2401399e3973d2f704f977bd6ad2d029ca7/tensorflow/python/keras/engine/training.py#L716),
so I updated my custom model to match it.
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy

def _minimize(strategy, tape, optimizer, loss, trainable_variables):
    with tape:
        if isinstance(optimizer, lso.LossScaleOptimizer):
            loss = optimizer.get_scaled_loss(loss)

    gradients = tape.gradient(loss, trainable_variables)
    # My own gradient-clipping helpers (defined elsewhere in the notebook).
    gradients = [ClipIfNotNone(grad) for grad in gradients]
    gradients = [ClipIfNotNone2(grad) for grad in gradients]

    # Whether to aggregate gradients outside of optimizer. This requires support
    # of the optimizer and doesn't work with ParameterServerStrategy and
    # CentralStorageStrategy.
    aggregate_grads_outside_optimizer = (
        optimizer._HAS_AGGREGATE_GRAD and  # pylint: disable=protected-access
        not isinstance(strategy.extended,
                       parameter_server_strategy.ParameterServerStrategyExtended))

    if aggregate_grads_outside_optimizer:
        # We aggregate gradients before unscaling them, in case a subclass of
        # LossScaleOptimizer all-reduces in fp16. All-reducing in fp16 can only be
        # done on scaled gradients, not unscaled gradients, for numeric stability.
        gradients = optimizer._aggregate_gradients(zip(gradients,  # pylint: disable=protected-access
                                                       trainable_variables))
    if isinstance(optimizer, lso.LossScaleOptimizer):
        gradients = optimizer.get_unscaled_gradients(gradients)
    gradients = optimizer._clip_gradients(gradients)  # pylint: disable=protected-access
    if trainable_variables:
        if aggregate_grads_outside_optimizer:
            optimizer.apply_gradients(
                zip(gradients, trainable_variables),
                experimental_aggregate_gradients=False)
        else:
            optimizer.apply_gradients(zip(gradients, trainable_variables))
class CustomModel(tf.keras.Model):
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x = data
        y = tf.constant([1.0], dtype=tf.float32)
        sample_weight = None
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,
                  self.trainable_variables)
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
        return {m.name: m.result() for m in self.metrics}
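ClipIfNotNone and ClipIfNotNone2 are my own helpers; their actual bodies are in the notebook. Purely for illustration, they do something along these lines (hypothetical clip bounds):

# Hypothetical stand-ins for the notebook's clipping helpers: pass None
# gradients through unchanged and clip the rest.
def ClipIfNotNone(grad):
    return grad if grad is None else tf.clip_by_value(grad, -1.0, 1.0)

def ClipIfNotNone2(grad):
    return grad if grad is None else tf.clip_by_norm(grad, 1.0)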
However, the result is almost identical:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function *
outputs = self.distribute_strategy.run(
<ipython-input-8-823751185253>:53 train_step *
_minimize(self.distribute_strategy, tape, self.optimizer, loss,
<ipython-input-8-823751185253>:24 _minimize *
gradients = optimizer._aggregate_gradients(zip(gradients, # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:521 _aggregate_gradients **
filtered_grads_and_vars = _filter_grads(grads_and_vars)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 _filter_grads
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 <listcomp>
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1123 name
"Tensor.name is meaningless when eager execution is enabled.")
AttributeError: Tensor.name is meaningless when eager execution is enabled.
Edit 2:
I also tried not writing a custom train_step at all and simply subclassing tf.keras.Model. I still hit the same problem.
Here is what my custom model looks like:
from copy import deepcopy

import tensorflow as tf
from tensorflow.keras.layers import Dropout
from transformers import TFRobertaModel

class Dora_A(tf.keras.Model):
    def __init__(self):
        super(Dora_A, self).__init__()
        self.bioRoberta = TFRobertaModel.from_pretrained('allenai/biomed_roberta_base', from_pt=True)

        self.Q_Tlayer0 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.Q_Tlayer0._name = self.Q_Tlayer0._name + 'Query'
        self.P_Tlayer0 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.P_Tlayer0._name = self.P_Tlayer0._name + 'Passage'

        self.Q_Tlayer1 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.Q_Tlayer1._name = self.Q_Tlayer1._name + 'Query'
        self.P_Tlayer1 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.P_Tlayer1._name = self.P_Tlayer1._name + 'Passage'

        self.Q_Tlayer2 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.Q_Tlayer2._name = self.Q_Tlayer2._name + 'Query'
        self.P_Tlayer2 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.P_Tlayer2._name = self.P_Tlayer2._name + 'Passage'

        self.Q_Tlayer3 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.Q_Tlayer3._name = self.Q_Tlayer3._name + 'Query'
        self.P_Tlayer3 = deepcopy(self.bioRoberta.layers[0].encoder.layer[11])
        self.P_Tlayer3._name = self.P_Tlayer3._name + 'Passage'

        self.Q_Tlayer3.intermediate.intermediate_act_fn = tf.keras.activations.tanh
        self.P_Tlayer3.intermediate.intermediate_act_fn = tf.keras.activations.tanh

        # self.Q_Tlayer0.set_weights(self.Q_Tlayer3.get_weights())
        # self.P_Tlayer0.set_weights(self.P_Tlayer3.get_weights())
        # self.Q_Tlayer1.set_weights(self.Q_Tlayer3.get_weights())
        # self.P_Tlayer1.set_weights(self.P_Tlayer3.get_weights())
        # self.Q_Tlayer2.set_weights(self.Q_Tlayer3.get_weights())
        # self.P_Tlayer2.set_weights(self.P_Tlayer3.get_weights())

        self.Q_ff_1 = tf.keras.layers.Dense(768, activation='swish', name='qffPost_n1')
        self.P_ff_1 = tf.keras.layers.Dense(768, activation='swish', name='pffPost_n1')
        self.Q_ff_2 = tf.keras.layers.Dense(768, activation='tanh', name='qffPost_n2')
        self.P_ff_2 = tf.keras.layers.Dense(768, activation='tanh', name='pffPost_n2')

    def call(self, inputIds):
        queryInputs, passageInputs = inputIds

        Q_outputs = self.bioRoberta(queryInputs)[0]
        P_outputs = self.bioRoberta(passageInputs)[0]

        Q_outputs = self.Q_Tlayer0((Q_outputs, None, None))[0]
        P_outputs = self.P_Tlayer0((P_outputs, None, None))[0]
        Q_outputs = self.Q_Tlayer1((Q_outputs, None, None))[0]
        P_outputs = self.P_Tlayer1((P_outputs, None, None))[0]
        Q_outputs = self.Q_Tlayer2((Q_outputs, None, None))[0]
        P_outputs = self.P_Tlayer2((P_outputs, None, None))[0]
        Q_outputs = self.Q_Tlayer3((Q_outputs, None, None))[0]
        P_outputs = self.P_Tlayer3((P_outputs, None, None))[0]

        Q_outputs = tf.concat([
            Q_outputs[:, 0],  # CLS; NOT taken from the ff layer after the last hidden state, since that seems untrained in RoBERTa
            tf.reduce_mean(Q_outputs[:, 1:-1], axis=1),  # pooled, excluding CLS and SEP
            tf.math.reduce_max(Q_outputs[:, 1:-1], axis=1),
            tf.math.reduce_min(Q_outputs[:, 1:-1], axis=1),
            tf.math.reduce_variance(Q_outputs[:, 1:-1], axis=1),
            tf.math.reduce_logsumexp(Q_outputs[:, 1:-1], axis=1),
            Q_outputs[:, -1]  # SEP, taken from the hidden state
        ], axis=1)
        P_outputs = tf.concat([
            P_outputs[:, 0],  # CLS; NOT taken from the ff layer after the last hidden state, since that seems untrained in RoBERTa
            tf.reduce_mean(P_outputs[:, 1:-1], axis=1),  # pooled, excluding CLS and SEP
            tf.math.reduce_max(P_outputs[:, 1:-1], axis=1),
            tf.math.reduce_min(P_outputs[:, 1:-1], axis=1),
            tf.math.reduce_variance(P_outputs[:, 1:-1], axis=1),
            tf.math.reduce_logsumexp(P_outputs[:, 1:-1], axis=1),
            P_outputs[:, -1]  # SEP, taken from the hidden state
        ], axis=1)

        Q_outputs = Dropout(0.10)(Q_outputs)
        P_outputs = Dropout(0.10)(P_outputs)

        Q_outputs = self.Q_ff_1(Q_outputs)
        P_outputs = self.P_ff_1(P_outputs)
        Q_outputs = self.Q_ff_2(Q_outputs)
        P_outputs = self.P_ff_2(P_outputs)

        dotProductMatrix = tf.linalg.matmul(Q_outputs, P_outputs, transpose_b=True, name='mm')

        return dotProductMatrix
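The model takes a (query ids, passage ids) pair and returns a query-vs-passage dot-product score matrix; a hypothetical usage sketch (the real dataset pipeline, batch size, and sequence length are in the Colab):

# Hypothetical smoke test; the shapes here are assumptions, not notebook values.
query_ids = tf.ones((8, 128), dtype=tf.int32)
passage_ids = tf.ones((8, 128), dtype=tf.int32)

model = Dora_A()
scores = model((query_ids, passage_ids))  # shape (8, 8): query-vs-passage scores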
Here is the error message I get when training:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-23-d78edec93dcb> in <module>()
1 model.fit(train_datasetFinal,
2 epochs=epochs,
----> 3 callbacks=callbacks)
4
5 # else:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
64 def _method_wrapper(self, *args, **kwargs):
65 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 66 return method(self, *args, **kwargs)
67
68 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
846 batch_size=batch_size):
847 callbacks.on_train_batch_begin(step)
--> 848 tmp_logs = train_function(iterator)
849 # Catch OutOfRangeError for Datasets of unknown size.
850 # This blocks until the batch has finished executing.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
578 xla_context.Exit()
579 else:
--> 580 result = self._call(*args, **kwds)
581
582 if tracing_count == self._get_tracing_count():
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
625 # This is the first call of __call__, so we have to initialize.
626 initializers = []
--> 627 self._initialize(args, kwds, add_initializers_to=initializers)
628 finally:
629 # At this point we know that the initialization is complete (or less
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
504 self._concrete_stateful_fn = (
505 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 506 *args, **kwds))
507
508 def invalid_creator_scope(*unused_args, **unused_kwds):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2444 args, kwargs = None, None
2445 with self._lock:
-> 2446 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2447 return graph_function
2448
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2775
2776 self._function_cache.missed.add(call_context_key)
-> 2777 graph_function = self._create_graph_function(args, kwargs)
2778 self._function_cache.primary[cache_key] = graph_function
2779 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2665 arg_names=arg_names,
2666 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2667 capture_by_value=self._capture_by_value),
2668 self._function_attributes,
2669 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
439 # __wrapped__ allows AutoGraph to swap in a converted function. We give
440 # the function a weak reference to itself to avoid a reference cycle.
--> 441 return weak_wrapped_fn().__wrapped__(*args, **kwds)
442 weak_wrapped_fn = weakref.ref(wrapped_fn)
443
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
AttributeError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function *
outputs = self.distribute_strategy.run(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/tpu_strategy.py:174 run **
return self.extended.tpu_run(fn, args, kwargs, options)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/tpu_strategy.py:867 tpu_run
return func(args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/tpu_strategy.py:934 tpu_function
padding_spec=padding_spec)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/tpu/tpu.py:893 replicate
padding_spec=padding_spec)[1]
/usr/local/lib/python3.6/dist-packages/tensorflow/python/tpu/tpu.py:1280 split_compile_and_replicate
outputs = computation(*computation_inputs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/tpu_strategy.py:896 replicated_fn
result[0] = fn(*replica_args, **replica_kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:541 train_step **
self.trainable_variables)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:1804 _minimize
trainable_variables))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:521 _aggregate_gradients
filtered_grads_and_vars = _filter_grads(grads_and_vars)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 _filter_grads
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1223 <listcomp>
([v.name for v in vars_with_empty_grads]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1123 name
"Tensor.name is meaningless when eager execution is enabled.")
AttributeError: Tensor.name is meaningless when eager execution is enabled.