在使用 TensorFlow 分布式策略 MultiWorkerMirroredStrategy 进行训练时,我在批量归一化(Batch Normalization)环节遇到了错误。
在构建模型时,我使用了批量归一化。
使用批量归一化时,只有主节点上会出现以下错误。
如果去掉批量归一化,代码就可以正常运行完成。
def _model_fn(features, labels, mode, params):
    """Build the LSTM -> Dense -> ReLU -> BatchNorm -> Dense layer stack.

    All layers are created inside the ``'model'`` variable scope, opened with
    ``reuse=tf.AUTO_REUSE``. Batch normalization is called with
    ``training=True`` only when ``mode`` equals
    ``tf.estimator.ModeKeys.TRAIN``.

    Args:
        features: Input passed directly to the LSTM layer.
        labels: Not referenced in the visible part of this function.
        mode: Value compared against ``tf.estimator.ModeKeys.TRAIN``.
        params: Mapping; ``params['label_size']`` is the unit count of the
            final Dense layer.

    NOTE(review): the visible snippet stops after the final Dense layer and
    returns nothing; presumably the loss / EstimatorSpec construction follows
    in the omitted remainder -- confirm against the full source.
    """
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        # Batch norm uses batch statistics only while training.
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        recurrent = tf.keras.layers.LSTM(64, return_sequences=True)
        sequence_out = recurrent(features)

        hidden = tf.keras.layers.Dense(32)(sequence_out)
        hidden = tf.keras.layers.Activation('relu')(hidden)
        hidden = tf.keras.layers.BatchNormalization()(
            hidden, training=is_training)

        outputs = tf.keras.layers.Dense(params['label_size'])(hidden)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/distribute/estimator_training.py", line 290, in train_and_evaluate
session_config=run_config.session_config)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/distribute/distribute_coordinator.py", line 853, in run_distribute_coordinator
task_id, session_config, rpc_layer)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/distribute/distribute_coordinator.py", line 360, in _run_single_worker
return worker_fn(strategy)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/distribute/estimator_training.py", line 252, in _worker_fn
hooks=hooks)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1156, in _train_model
return self._train_model_distributed(input_fn, hooks, saving_listeners)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1219, in _train_model_distributed
self._config._train_distribute, input_fn, hooks, saving_listeners)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1329, in _actual_train_model_distributed
saving_listeners)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1406, in _train_with_estimator_spec
estimator_spec, worker_hooks, saving_listeners)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1352, in _train_with_estimator_spec_distributed
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1252, in run
run_metadata=run_metadata)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1353, in run
raise six.reraise(*original_exc_info)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1338, in run
return self._sess.run(*args, **kwargs)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1411, in run
run_metadata=run_metadata)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1169, in run
return self._sess.run(*args, **kwargs)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/home/ab981s/anaconda3/envs/py2tensorflow_nightly/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: From /job:worker/replica:0/task:0:
Upper bound check fail for input 10 from node geolocation_model/gradients/geolocation_model/batch_normalization/batchnorm/add_1_grad/Reshape_1 to node scoped_allocator_concat_1_15 input bounds = [0x7f035f94a200, 0x7f035f94a280] backing_tensor bounds = [0x7f035ee22900, 0x7f035eebe7c8]
[[{{node scoped_allocator_concat_1_15}}]]