I am trying to use tf.distribute.MirroredStrategy() to run a custom training loop on multiple GPUs. The training loop runs perfectly on a single GPU, but as soon as I try to use multiple GPUs it throws ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call. I am using TensorFlow 1.14 and Python 3.7.3.
A minimal example is provided below. It runs without problems on a single GPU, but with tf.distribute.MirroredStrategy() it fails with the following error message (full output):
ValueError Traceback (most recent call last)
<ipython-input-11-3fda5d330457> in <module>
1 with mirrored_strategy.scope():
----> 2 model, train_op, X1_in, X2_in = create_model_and_train_op()
3 with tf.Session() as sess:
4 sess.run(tf.global_variables_initializer())
5 for sample_ind in range(n_samples):
<ipython-input-7-8f5b3971bbe2> in create_model_and_train_op()
6
7 model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])
----> 8 train_op = tf.train.AdamOptimizer().minimize(loss)
9
10 return model, train_op, X1_in, X2_in
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
401 aggregation_method=aggregation_method,
402 colocate_gradients_with_ops=colocate_gradients_with_ops,
--> 403 grad_loss=grad_loss)
404
405 vars_with_grad = [v for g, v in grads_and_vars if g is not None]
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in compute_gradients(self, loss, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, grad_loss)
510 gate_gradients=(gate_gradients == Optimizer.GATE_OP),
511 aggregation_method=aggregation_method,
--> 512 colocate_gradients_with_ops=colocate_gradients_with_ops)
513 if gate_gradients == Optimizer.GATE_GRAPH:
514 grads = control_flow_ops.tuple(grads)
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_impl.py in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
156 ys, xs, grad_ys, name, colocate_gradients_with_ops,
157 gate_gradients, aggregation_method, stop_gradients,
--> 158 unconnected_gradients)
159 # pylint: enable=protected-access
160
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
595 xs = [
596 x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597 for x in xs
598 ]
599 xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in <listcomp>(.0)
595 xs = [
596 x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597 for x in xs
598 ]
599 xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/distribute/values.py in handle(self)
641 device = distribute_lib.get_update_device()
642 if device is None:
--> 643 raise ValueError("`handle` is not available outside the replica context"
644 " or a `tf.distribute.Strategy.update()` call.")
645 return self.get(device=device).handle
ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
The only fix Google turns up is updating to the TensorFlow 2.0.0 beta. I would like to know whether there is also a way around this in 1.14. As the traceback shows, minimize() ends up asking each mirrored variable for its `handle`, which is only defined inside a replica context (a sketch of one candidate pattern follows the example below).
Here is the minimal example I tried:
import sys

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

print(sys.version)
print(tf.__version__)

input_dim = 42
n_samples = 10000

x1_data = np.random.rand(n_samples, input_dim)
x2_data = np.random.rand(n_samples, input_dim)

def create_model_and_train_op():
    X1_in = Input(shape=(input_dim,))
    X2_in = Input(shape=(input_dim,))
    XY = Concatenate(axis=-1)([X1_in, X2_in])
    loss = Dense(1)(XY)

    model = Model(name='BNN', inputs=[X1_in, X2_in], outputs=[loss])

    # Error message is thrown in the following line if using MirroredStrategy()
    train_op = tf.train.AdamOptimizer().minimize(loss)

    return model, train_op, X1_in, X2_in

##### Single GPU: runs without problems
model, train_op, X1_in, X2_in = create_model_and_train_op()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for sample_ind in range(n_samples):
        sess.run(train_op,
                 feed_dict={X1_in: x1_data[sample_ind].reshape(1, input_dim),
                            X2_in: x2_data[sample_ind].reshape(1, input_dim)})

##### Multiple GPUs: results in the error message
mirrored_strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))

with mirrored_strategy.scope():
    model, train_op, X1_in, X2_in = create_model_and_train_op()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for sample_ind in range(n_samples):
        sess.run(train_op,
                 feed_dict={X1_in: x1_data[sample_ind].reshape(1, input_dim),
                            X2_in: x2_data[sample_ind].reshape(1, input_dim)})
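For reference, here is a minimal sketch of the pattern the 1.14 tf.distribute API appears to expect for custom loops: build the loss and the minimize() call inside the replica context via strategy.extended.call_for_each_replica, and feed data through a tf.data iterator instead of feed_dict. The API names used here (make_dataset_iterator, extended.call_for_each_replica, experimental_local_results) are assumptions based on the 1.14 distribute module, not a verified fix:

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Concatenate

input_dim = 42
n_samples = 10000
x1_data = np.random.rand(n_samples, input_dim).astype('float32')
x2_data = np.random.rand(n_samples, input_dim).astype('float32')

mirrored_strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices((x1_data, x2_data)).batch(8).repeat()

with mirrored_strategy.scope():
    # The distributed iterator gives each replica its own portion of the data.
    iterator = mirrored_strategy.make_dataset_iterator(dataset)

    def replica_step(x1, x2):
        # Everything, including minimize(), is built per replica here, so each
        # variable's `handle` resolves to the copy on the current device.
        xy = Concatenate(axis=-1)([x1, x2])
        loss = tf.reduce_mean(Dense(1)(xy))
        return tf.train.AdamOptimizer().minimize(loss)

    per_replica_op = mirrored_strategy.extended.call_for_each_replica(
        replica_step, args=iterator.get_next())
    # Collect the per-replica train ops into a single op the session can run.
    # (experimental_local_results is assumed; older 1.x releases called it unwrap.)
    train_op = tf.group(
        *mirrored_strategy.experimental_local_results(per_replica_op))

with tf.Session() as sess:
    sess.run(iterator.initialize())
    sess.run(tf.global_variables_initializer())
    for _ in range(n_samples // 8):
        sess.run(train_op)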
Answer 0 (score: 0)
I solved this by saving and loading only the weights. Here is a modified version of the multi-GPU training example from the Keras manual.
import os

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.backend import set_session

def get_compiled_model():
    # Make a simple 2-layer densely-connected neural network.
    global sess
    global graph
    sess = tf.Session()
    graph = tf.get_default_graph()

    # A special trick from https://github.com/tensorflow/tensorflow/issues/28287
    # IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for Keras!
    # Otherwise their weights will be unavailable in the threads after the
    # session there has been set.
    set_session(sess)

    inputs = keras.Input(shape=(784,))
    x = keras.layers.Dense(256, activation="relu")(inputs)
    x = keras.layers.Dense(256, activation="relu")(x)
    outputs = keras.layers.Dense(10)(x)
    model = keras.Model(inputs, outputs)

    checkpoints = [checkpoint_dir + "/" + name for name in os.listdir(checkpoint_dir)]
    if checkpoints:
        checkpoints.sort()
        # Strip the file extension so load_weights() gets the checkpoint prefix.
        latest_checkpoint = checkpoint_dir + "/" + os.path.splitext(os.path.split(checkpoints[-1])[-1])[0]
        print("Restoring from", latest_checkpoint)
        model.load_weights(latest_checkpoint)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    return model

def get_dataset():
    batch_size = 32
    num_val_samples = 10000

    # Return the MNIST dataset in the form of a `tf.data.Dataset`.
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

    # Preprocess the data (these are NumPy arrays).
    x_train = x_train.reshape(-1, 784).astype("float32") / 255
    x_test = x_test.reshape(-1, 784).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    # Reserve num_val_samples samples for validation.
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]
    x_train = x_train[:-num_val_samples]
    y_train = y_train[:-num_val_samples]

    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),
    )

# Train the model on all available devices.
train_dataset, val_dataset, test_dataset = get_dataset()

# Prepare a directory to store all the checkpoints.
checkpoint_dir = "./ckpt"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

def make_or_restore_model():
    # Either restore the latest model, or create a fresh one
    # if there is no checkpoint available.
    print("Creating a new model")
    return get_compiled_model()

def run_training(epochs=1):
    # Create a MirroredStrategy.
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))

    # Open a strategy scope and create/restore the model.
    with strategy.scope():
        model = make_or_restore_model()

    callbacks = [
        # This callback saves the weights every epoch.
        # We include the current epoch in the file name.
        keras.callbacks.ModelCheckpoint(
            save_weights_only=True,
            filepath=checkpoint_dir + "/ckpt-{epoch}.cpkt",
            save_freq="epoch",
        )
    ]
    with graph.as_default():
        set_session(sess)
        model.fit(
            train_dataset,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=val_dataset,
            verbose=2,
        )
    return model

# Running the first time creates the model.
model = run_training(epochs=2)

# Test the model on all available devices.
print("Evaluating")
model.evaluate(test_dataset)

run_training(epochs=1)
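What appears to make this work is that minimize() is never called by hand: model.compile() and model.fit() let Keras create and update the mirrored variables inside the proper replica context, while save_weights_only=True together with model.load_weights() restores state between runs without trying to deserialize a whole model into the strategy scope.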