在 NVIDIA GTX 1080 Ti 上运行以下代码时，如果 FLOAT_DTYPE = tf.float16，运行时间为 0.61 秒；如果 FLOAT_DTYPE = tf.float32，运行时间为 0.26 秒。时间线（timeline）显示 scatter_sub 占用了很大一部分时间。为什么 float16 比 float32 慢得多？TensorFlow 版本是 1.8。
import time
import tensorflow as tf
# Benchmark script: times 10 Adam update steps on a large embedding table so
# that the float16 and float32 code paths can be compared.
# Switch FLOAT_DTYPE between tf.float16 and tf.float32 to compare timings.
FLOAT_DTYPE = tf.float16

# Embedding table of shape [1000000, 5], initialised to ones.
var = tf.get_variable(
    "embedding",
    initializer=tf.ones(shape=[1000000, 5], dtype=FLOAT_DTYPE),
    dtype=FLOAT_DTYPE,
)

# Look up the first 100000 rows of the table.
embeddings = tf.nn.embedding_lookup(var, list(range(100000)))

# Project each looked-up row down to a single value.
w = tf.get_variable("w", shape=[5, 1], dtype=FLOAT_DTYPE)
z = tf.matmul(embeddings, w)

opt = tf.train.AdamOptimizer(1e-3, epsilon=1e-4)

# Only the embedding table is updated; `w` is deliberately left out of `xs`.
g = tf.gradients(z, xs=[var])
train = opt.apply_gradients([(g[0], var)])

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    # NOTE: there is no warm-up step, so the measured time includes whatever
    # one-off cost the first sess.run() call incurs.
    start = time.time()
    for _ in range(10):
        sess.run(train)
    elapsed = time.time() - start

    # Parenthesised form prints identically under Python 2 and Python 3.
    print('Time: %f' % elapsed)