I want to run a Bi-LSTM on multiple GPUs, but when I run the code with 'with tf.device("/gpu:0")' statements, the Bi-LSTM fails to run on the GPUs. I don't know how to solve this problem. Can anyone help? Here is part of my code:
with tf.Session(config=sess_config) as sess:
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(args.n_gpus):
            with tf.device("/gpu:%d" % gpu_avaiables[i]):
                with tf.name_scope("tower_%d" % gpu_avaiables[i]):
                    # slice out this tower's shard of the batch
                    _x = x[i * batch_size:(i + 1) * batch_size]
                    _q = q[i * batch_size:(i + 1) * batch_size]
                    _x_length = x_length[i * batch_size:(i + 1) * batch_size]
                    _q_length = q_length[i * batch_size:(i + 1) * batch_size]
                    _start = start[i * batch_size:(i + 1) * batch_size]
                    _end = end[i * batch_size:(i + 1) * batch_size]
                    _x_emb = x_emb[i * batch_size:(i + 1) * batch_size]
                    _q_emb = q_emb[i * batch_size:(i + 1) * batch_size]
                    data = (_x, _q, _x_length, _q_length, _start, _end, _x_emb, _q_emb, dropout_keep_prob)
                    model = BiDAFModel_ngpus(args, data)
                    tf.get_variable_scope().reuse_variables()
                    model_loss = model.loss
                    p1, p2 = model.p1, model.p2
                    grads = opt.compute_gradients(model_loss)
                    tower_grads.append(grads)
    # average the per-tower gradients and apply them once
    grads = average_gradients(tower_grads)
    train_op = opt.apply_gradients(grads)
The error is:
Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:3b:00.0, compute capability: 6.1
/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:af:00.0, compute capability: 6.1
/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: GeForce GTX 1080 Ti, pci bus id: 0000:d8:00.0, compute capability: 6.1
Traceback (most recent call last):
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
return fn(*args)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1317, in _run_fn
self._extend_graph()
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1352, in _extend_graph
tf_session.ExtendSession(self._session)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert: Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Registered kernels:
device='CPU'
[[{{node tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert}} = Assert[T=[DT_STRING, DT_INT32, DT_STRING, DT_INT32], summarize=3, _device="/device:GPU:0"](tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/All, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_0, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/stack, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_2, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Shape_1)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "run.py", line 298, in <module>
run()
File "run.py", line 293, in run
multi_gpu_train(args)
File "run.py", line 162, in multi_gpu_train
training(args, data, vocab)
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 162, in training
model = BiDAFModel_ngpus(args, data)
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 37, in __init__
self.sess.run(tf.global_variables_initializer())
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot assign a device for operation tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert: Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Registered kernels:
device='CPU'
[[node tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert (defined at /home/home1/dmyan/codes/BIDAF/layers/rnn.py:9) = Assert[T=[DT_STRING, DT_INT32, DT_STRING, DT_INT32], summarize=3, _device="/device:GPU:0"](tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/All, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_0, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/stack, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_2, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Shape_1)]]
Caused by op 'tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert', defined at:
File "run.py", line 298, in <module>
run()
File "run.py", line 293, in run
multi_gpu_train(args)
File "run.py", line 162, in multi_gpu_train
training(args, data, vocab)
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 162, in training
model = BiDAFModel_ngpus(args, data)
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 35, in __init__
self.bulid_graph()
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 45, in bulid_graph
self.contextual_embedding()
File "/home/home1/dmyan/codes/BIDAF/model_multi_gpu.py", line 57, in contextual_embedding
self.h = rnn(self.x_embed, self.hidden_size, self.x_length)
File "/home/home1/dmyan/codes/BIDAF/layers/rnn.py", line 9, in rnn
cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py", line 439, in bidirectional_dynamic_rnn
time_major=time_major, scope=fw_scope)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py", line 651, in dynamic_rnn
[_assert_has_shape(sequence_length, [batch_size])]):
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py", line 646, in _assert_has_shape
packed_shape, " but saw shape: ", x_shape])
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py", line 189, in wrapped
return _add_should_use_warning(fn(*args, **kwargs))
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 159, in Assert
return gen_logging_ops._assert(condition, data, summarize, name="Assert")
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 52, in _assert
name=name)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/home/home1/dmyan/.conda/envs/py_env/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): Cannot assign a device for operation tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert: Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Registered kernels:
device='CPU'
[[node tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert (defined at /home/home1/dmyan/codes/BIDAF/layers/rnn.py:9) = Assert[T=[DT_STRING, DT_INT32, DT_STRING, DT_INT32], summarize=3, _device="/device:GPU:0"](tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/All, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_0, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/stack, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Assert/Assert/data_2, tower_0/paragraph_encoding/bidirectional_rnn/fw/fw/Shape_1)]]
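From the trace, the op that fails is the Assert that bidirectional_dynamic_rnn adds for the sequence_length shape check, and it apparently only has a CPU kernel, so the explicit '/device:GPU:0' placement cannot be satisfied. I was wondering whether letting TensorFlow fall back to the CPU for such ops via allow_soft_placement in the session config would be enough. Below is a rough sketch of what I have in mind (the exact sess_config fields are my guess, not what I currently use):

import tensorflow as tf

# Sketch only: allow_soft_placement lets TensorFlow place ops that have no GPU
# kernel (such as Assert) on the CPU instead of raising InvalidArgumentError.
sess_config = tf.ConfigProto(
    allow_soft_placement=True,    # fall back to CPU when a GPU kernel is missing
    log_device_placement=False,   # set True to print where each op actually runs
)
sess_config.gpu_options.allow_growth = True  # allocate GPU memory on demand

with tf.Session(config=sess_config) as sess:
    ...  # build the towers and run training as in the code above

If this is the right direction, I could also turn on log_device_placement once to confirm that only the Assert (and similar bookkeeping ops) get moved to the CPU while the RNN kernels stay on the GPUs.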
I would be very grateful to anyone who can help me solve this problem!