当我尝试在GPU上使用tf.matmul函数时,我收到以下错误:
InternalError: Blas xGEMMBatched launch failed
如果函数N
中的calc()
值设置为小于15的值,则可以正常工作。
我正在运行tensorflow 1.8.0和Cuda V9.1.85。 GPU上只有一个Python进程,没有其他开放会话。此外,我还有足够的GPU内存(see attached image)。
更改CUDA_VISIBLE_DEVICES
值不会显示任何效果。更改ConfigProto()
设置也无济于事。使用tf.matmul
也无法解决问题。
这是我正在运行的代码:
import tensorflow as tf
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
tf.Session(config=config).close()
def calc():
N = 15 # works for N <= 14
a = 16
b = 8
X = np.random.rand(N, 11520, b, 1).astype(np.float32)
print(X.nbytes*1e-6, "MB")
W = np.random.rand(N, 11520, a, b).astype(np.float32)
print(W.nbytes*1e-6, "MB")
X_ = tf.constant(X, name="X-constant", dtype=tf.float32)
W_ = tf.constant(W, name="W-constant", dtype=tf.float32)
# return tf.matmul(W_, X_, name="mymatmul")
return W_ @ X_
tf.reset_default_graph()
a = calc()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
b = sess.run(a)
sess.close()
print(b.shape)
这是我得到的输出:
5.529599999999999 MB
88.47359999999999 MB
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1321 try:
-> 1322 return fn(*args)
1323 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1306 return self._call_tf_sessionrun(
-> 1307 options, feed_dict, fetch_list, target_list, run_metadata)
1308
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1408 self._session, options, feed_dict, fetch_list, target_list,
-> 1409 run_metadata)
1410 else:
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
During handling of the above exception, another exception occurred:
InternalError Traceback (most recent call last)
<ipython-input-5-013153235a1a> in <module>()
3 sess = tf.Session()
4 sess.run(tf.global_variables_initializer())
----> 5 b = sess.run(a)
6 sess.close()
7 print(b.shape)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
898 try:
899 result = self._run(None, fetches, feed_dict, options_ptr,
--> 900 run_metadata_ptr)
901 if run_metadata:
902 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1133 if final_fetches or final_targets or (handle and feed_dict_tensor):
1134 results = self._do_run(handle, final_targets, final_fetches,
-> 1135 feed_dict_tensor, options, run_metadata)
1136 else:
1137 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1314 if handle is None:
1315 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1316 run_metadata)
1317 else:
1318 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 except KeyError:
1334 pass
-> 1335 raise type(e)(node_def, op, message)
1336
1337 def _extend_graph(self):
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
Caused by op 'matmul', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 486, in start
self.io_loop.start()
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
self._run_once()
File "/usr/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
handle._run()
File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 117, in _handle_events
handler_func(fileobj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-5-013153235a1a>", line 2, in <module>
a = calc()
File "<ipython-input-4-bf0e6012e9e2>", line 13, in calc
return W_ @ X_
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 847, in binary_op_wrapper
return func(x, y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 1976, in matmul
a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1236, in batch_mat_mul
"BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1740, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InternalError (see above for traceback): Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
答案 0 :(得分:1)
以下是将tf.matmul
替换为tf.einsum
的解决方法。但是,您的代码可以在我的机器上使用NVIDIA 840M(2004 MiB RAM),cudnn 7.0.5.15和cuda 9.0.176(可能降级有帮助吗?)。
import tensorflow as tf
import numpy as np
sess = tf.Session()
N = 20
M = 11520
a = 16
b = 8
W = np.random.rand(N, M, a, b).astype(np.float32)
X = np.random.rand(N, M, b, 1).astype(np.float32)
# tf.einsum does not support numpy arrays, so wrap W and X in tf.constants
W2 = tf.constant(W)
X2 = tf.constant(X)
# tf.einsum does not support "..." as seen later in np.einsum
WX = tf.einsum("uvik,uvkj->uvij", W2, X2)
# same as:
#WX = tf.matmul(W2, X2)
# calculate W@X using tf.einsum
result1 = sess.run(WX)
# calculate W@X using np.einsum
result2 = np.einsum("...ik,...kj->...ij", W, X)
# calculate W@X by hand (just for illustrative purpose, too slow for practical use)
result3 = np.zeros((N, M, a, 1), dtype=np.float32)
for i in range(a):
for j in range(1):
for k in range(b):
result3[..., i, j] += W[..., i, k] * X[..., k, j]
# ensure that everything is correct
assert(np.allclose(result1, result2))
assert(np.allclose(result1, result3))
print("everything ok")
sess.close()