我观察到,在我的机器上 TensorFlow 的 tf.matmul 运行速度明显慢于 numpy 的点积(dot)运算。我有一块 GTX 1080 GPU,本以为 tf.matmul 至少会和在 CPU 上用 numpy 运行的代码一样快。
环境信息
操作系统
lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 16.10
Release: 16.10
Codename: yakkety
已安装的CUDA和cuDNN版本:
ls -l /usr/local/cuda-8.0/lib64/libcud*
-rw-r--r-- 1 root root 556000 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudadevrt.a
lrwxrwxrwx 1 root root 16 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so -> libcudart.so.8.0
lrwxrwxrwx 1 root root 19 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0 -> libcudart.so.8.0.61
-rwxr-xr-x 1 root root 415432 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0.61
-rw-r--r-- 1 root root 775162 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart_static.a
lrwxrwxrwx 1 voldemaro users 13 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so -> libcudnn.so.5
lrwxrwxrwx 1 voldemaro users 18 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5 -> libcudnn.so.5.1.10
-rwxr-xr-x 1 voldemaro users 84163560 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5.1.10
-rw-r--r-- 1 voldemaro users 70364814 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn_static.a
TensorFlow设置
python -c "import tensorflow; print(tensorflow.__version__)"
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
1.0.0
代码:
'''
Created on Sep 28, 2017
@author: voldemaro
Running on I7/GTX 1080
no MKL
('TF version: ', 'v1.0.0-rc2-15-g47bba63-dirty')
('TF url: ', 'https://github.com/tensorflow/tensorflow/commit/47bba63')
Timing in ms for 2048 x 2048 SVD of type <type 'numpy.float32'> and matmul for 16920 x 2048 of type <type 'numpy.float32'>
numpy default SVD min: 3956.20, median: 4127.75, mean: 4264.41
TF CPU SVD min: 5926.43, median: 5951.70, mean: 5961.43
TF GPU SVD min: 5917.10, median: 6015.87, mean: 6039.63
numpy default .dot product min: 5816.97, median: 5933.43, mean: 5965.22
TF CPU matmul min: 21939.19, median: 22485.99, mean: 22374.69
TF GPU matmul min: 22026.52, median: 22109.97, mean: 22199.43
'''
from scipy import linalg; # for svd
import numpy as np;
import os;
import sys;
import time;
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" # nospam
import tensorflow as tf;
import gc; gc.disable();
# Benchmark configuration shared by the whole script.
NUM_RUNS = 5        # timed repetitions per benchmark() call
dtype = np.float32  # element type of every test matrix
N = 2048            # side length of the square (N, N) matrix fed to the SVD
M = 16920           # row count of the (M, N) matrix used in the matmul chain
def get_tensorflow_version_url():
    """Return the GitHub commit URL for the installed TensorFlow build.

    ``tf.__git_version__`` looks like ``'v1.0.0-65-g4763edf-dirty'``. Any
    single quotes and a trailing ``-dirty`` marker are stripped, and the text
    after the last ``-g`` is treated as the commit hash.

    Returns:
        ``'https://github.com/tensorflow/tensorflow/commit/'`` followed by the
        extracted hash. If the version string contains no ``-g`` separator,
        the whole cleaned string is appended instead of raising
        ``IndexError``.
    """
    import tensorflow as tf

    commit = tf.__git_version__.replace("'", "")
    if commit.endswith('-dirty'):
        commit = commit[:-len('-dirty')]
    # Index -1 instead of 1: identical when '-g' is present, but does not
    # blow up on version strings that carry no '-g<hash>' component.
    commit = commit.rsplit('-g', 1)[-1]
    return 'https://github.com/tensorflow/tensorflow/commit/' + commit
def get_mkl_version():
    """Return the version banner reported by the MKL runtime, as bytes.

    Loads ``libmkl_rt.so`` through ctypes and lets ``MKL_Get_Version_String``
    write into a zero-initialised byte buffer; every zero byte is then
    dropped from the result.

    Raises:
        OSError: if ``libmkl_rt.so`` cannot be loaded.
    """
    import ctypes
    import numpy as np

    buf_len = 199
    ver = np.zeros(buf_len, dtype=np.uint8)
    mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
    # Advertise one byte less than the buffer really holds, so the final
    # byte is never written and stays zero.
    mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), buf_len - 1)
    # tobytes() replaces tostring(), a deprecated alias that current NumPy
    # releases no longer provide; the returned bytes are the same.
    return ver[ver != 0].tobytes()
# Tracing leftovers: neither name is read anywhere else in the visible script.
timeline_counter = 0
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def benchmark(message, func, num_runs=None):
    """Time repeated calls of ``func`` and print min/median/mean in ms.

    Args:
        message: Label printed in front of the statistics (left-aligned,
            padded to 20 characters).
        func: Zero-argument callable to time; its return value is ignored.
        num_runs: Number of timed calls. Defaults to the module-level
            ``NUM_RUNS`` when omitted.

    Returns:
        None. One line is printed: ``"<message> min: ..., median: ...,
        mean: ..."``, or ``"<message> empty"`` when no run was executed.
    """
    if num_runs is None:
        num_runs = NUM_RUNS
    # Prefer the monotonic high-resolution clock for interval timing; fall
    # back to time.time on interpreters that do not provide perf_counter.
    timer = getattr(time, "perf_counter", time.time)

    elapsed = []
    for _ in range(num_runs):
        start = timer()
        func()
        elapsed.append(timer() - start)
    elapsed_ms = 1000 * np.array(elapsed)  # seconds -> milliseconds

    if len(elapsed_ms) > 0:
        result = "min: %8.2f, median: %8.2f, mean: %8.2f" % (
            np.min(elapsed_ms), np.median(elapsed_ms), np.mean(elapsed_ms))
    else:
        result = "empty"
    print("%-20s %s" % (message, result))
# Print the MKL banner when numpy's build config has a "lapack_mkl_info"
# entry, otherwise say so; then print the TensorFlow build and its commit URL.
if not np.__config__.get_info("lapack_mkl_info"):
    print("no MKL")
else:
    print("MKL version", get_mkl_version())
print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())
# Host-side (numpy) inputs and the TensorFlow graph, built once per device.
# NOTE(review): this listing has lost its indentation. The statements after
# each `with tf.device(...)` line are presumably its body -- confirm, in
# particular for the final V_cpu assignment, whose device scope cannot be
# determined from the text as shown.
# (N, N) matrix for the numpy/scipy SVD benchmark.
svd_array = np.random.random_sample((N,N)).astype(dtype);
# (M, N) matrix for the numpy dot-product benchmark.
another_array = np.random.random_sample((M,N)).astype(dtype);
# NOTE(review): no tf.Variable is created anywhere in the visible code, so
# this initializer appears to have nothing to initialize.
init_OP = tf.global_variables_initializer();
with tf.device("/gpu:0"):
# NOTE(review): init_holder_gpu is not referenced anywhere else in the
# visible script.
init_holder_gpu = tf.placeholder(dtype, shape=(M,M));
specVarGPU = tf.random_uniform((N,N), dtype=dtype);
S_gpu = tf.random_uniform((M,N), dtype=dtype);
# Matmul chain: transpose(transpose(conj(S))) x specVar x transpose(S).
V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU, ), tf.transpose(S_gpu));
[D2_gpu, E1_gpu, E2_gpu] = tf.svd(specVarGPU);
with tf.device("/cpu:0"):
# NOTE(review): init_holder_cpu is not referenced anywhere else in the
# visible script.
init_holder_cpu = tf.placeholder(dtype, shape=(M,M));
specVarCPU = tf.random_uniform((N,N), dtype=dtype);
S_cpu = tf.random_uniform((M,N), dtype=dtype);
V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU, ), tf.transpose(S_cpu));
[D2_cpu, E1_cpu, E2_cpu] = tf.svd(specVarCPU);
# NOTE(review): this rebinds V_cpu, discarding the definition two statements
# above. The new chain takes E1_cpu (an output of tf.svd) where the GPU chain
# takes specVarGPU, so V_cpu depends on the SVD while V_gpu does not.
V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), E1_cpu), tf.transpose(S_cpu));
# Benchmark driver: each step wraps one workload in a zero-argument function
# and hands it to benchmark() together with the label to print.
print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype));
# scipy SVD of the (N, N) numpy matrix.
def func(): linalg.svd(svd_array)
benchmark("numpy default SVD", func)
# Session with soft placement enabled and the graph optimizer level set to L0.
config = tf.ConfigProto(allow_soft_placement = True, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)));
sess = tf.Session(config = config);
sess.run(init_OP);
# The SVD benchmarks run the `.op` of each of the three tf.svd outputs.
def func2(): sess.run([D2_cpu.op, E1_cpu.op, E2_cpu.op]);
benchmark("TF CPU SVD", func2);
def func3(): sess.run([D2_gpu.op, E1_gpu.op, E2_gpu.op]);
benchmark("TF GPU SVD", func3);
# numpy chain: asmatrix -> transpose -> getH -> dot(svd_array) -> dot(transposed
# another_array); the product is discarded.
def func1(): np.transpose(np.asmatrix(another_array)).getH().dot(svd_array).dot(np.transpose(another_array));
benchmark("numpy default .dot product", func1)
# NOTE(review): func4/func5 pass the tensors themselves to sess.run, whereas
# func2/func3 pass `.op` -- presumably this also copies the result back to the
# host on every run; confirm whether that is intended.
def func4(): sess.run([V_cpu]);
benchmark("TF CPU matmul", func4)
def func5(): sess.run([V_gpu])
# NOTE(review): the call below passes func4, not func5, so the "TF GPU matmul"
# row times the same callable as "TF CPU matmul"; func5 is never called in the
# visible script.
benchmark("TF GPU matmul", func4)
答案 0 :(得分:0)
显然,TensorFlow 不会优化“嵌套”操作,所以 tf.matmul(tf.transpose(tf.conj(a)), x) 比依次执行 b = tf.conj(a)、c = tf.transpose(b) 和 d = tf.matmul(c, x) 耗时要长得多。
答案 1 :(得分:0)
对于SVD,问题是SVD还没有GPU内核。见这里:https://github.com/tensorflow/tensorflow/issues/11588
这意味着即使张量是在 GPU 上创建的,SVD 也必须在 CPU 上计算。因此会产生额外开销:先把数据从 GPU 传输到 CPU 进行计算,再把结果传回 GPU 存储。
对于 GPU 上的 matmul,问题出在基准测试(benchmarking)代码的最后一行:你调用的不是 func5,而是再次调用了 func4,所以实际上又对 TF CPU matmul 做了一次基准测试。
除此之外,你的代码中还有其他几处值得检查:
- 可以删除 init_holder_cpu 和 init_holder_gpu 这两个变量,因为你并没有使用它们。
- V_cpu 被定义了两次,第二次定义以 SVD 的结果作为输入,因此该测试实际上同时运行了 SVD 和 matmul。
稍微清理过的代码版本如下:
# ... above is the same
# Report the TensorFlow build under test.
print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())

# Host-side inputs for the numpy/scipy benchmarks.
svd_array = np.random.random_sample((N, N)).astype(dtype)
another_array = np.random.random_sample((M, N)).astype(dtype)

# The same pair of workloads (matmul chain and SVD), built once under each
# device scope.
with tf.device("/gpu:0"):
    specVarGPU = tf.random_uniform((N, N), dtype=dtype)
    S_gpu = tf.random_uniform((M, N), dtype=dtype)
    V_gpu = tf.matmul(
        tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU),
        tf.transpose(S_gpu),
    )
    D2_gpu, E1_gpu, E2_gpu = tf.svd(specVarGPU)

with tf.device("/cpu:0"):
    specVarCPU = tf.random_uniform((N, N), dtype=dtype)
    S_cpu = tf.random_uniform((M, N), dtype=dtype)
    V_cpu = tf.matmul(
        tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU),
        tf.transpose(S_cpu),
    )
    D2_cpu, E1_cpu, E2_cpu = tf.svd(specVarCPU)

# Session settings: soft placement on, graph optimizer level L0.
config = tf.ConfigProto(
    allow_soft_placement=True,
    graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)),
)
def V_numpy():
    """Run the numpy version of the matmul chain; the product is discarded.

    Reads the module-level ``another_array`` and ``svd_array`` and returns
    None -- the function exists only to be timed.
    """
    left = np.transpose(np.transpose(np.conj(another_array)))
    right = np.transpose(another_array)
    np.matmul(np.matmul(left, svd_array), right)
# Run every benchmark inside a single session, in the order listed.
with tf.Session(config=config) as sess:
    print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype))
    # (label, zero-argument callable) pairs handed to benchmark().
    cases = [
        ("numpy default SVD", lambda: linalg.svd(svd_array)),
        ("TF CPU SVD", lambda: sess.run([D2_cpu.op, E1_cpu.op, E2_cpu.op])),
        ("TF GPU SVD", lambda: sess.run([D2_gpu.op, E1_gpu.op, E2_gpu.op])),
        ("numpy MKL matmul", V_numpy),
        ("TF CPU matmul", lambda: sess.run([V_cpu.op])),
        ("TF GPU matmul", lambda: sess.run([V_gpu.op])),
    ]
    for label, timed in cases:
        benchmark(label, timed)
输出(在i7和GTX 1070上)
MKL version b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications'
TF version: v1.4.0-rc1-11-g130a514
TF url: https://github.com/tensorflow/tensorflow/commit/130a514
Timing in ms for 2048 x 2048 SVD of type <class 'numpy.float32'> and matmul for 16920 x 2048 of type <class 'numpy.float32'>
numpy default SVD min: 3318.42, median: 3320.40, mean: 3320.40
TF CPU SVD min: 4576.71, median: 4577.02, mean: 4577.02
TF GPU SVD min: 14022.59, median: 14172.69, mean: 14172.69
numpy MKL matmul min: 4500.33, median: 4628.01, mean: 4628.01
TF CPU matmul min: 15420.19, median: 15664.84, mean: 15664.84
TF GPU matmul min: 277.80, median: 282.54, mean: 282.54
可以看到,matmul 的 GPU 版本比任何 CPU 实现都快得多,符合预期。