I ran an experiment comparing the CudnnLSTM API between Keras and TensorFlow, and was surprised to find that, on the same dataset and with a similar model architecture, the TF model built on this API runs much more slowly than the Keras version.
First, I simulated a dataset (binary classification; the features are time series with 10 time steps and an embedding dimension of 300):
## the simulated dataset
import numpy as np

total_n = 512000
train_X = np.random.normal(0, 1, (total_n, 10, 300))
train_y = (np.random.normal(0, 1, (total_n, 1)) > 0).astype(np.int32)
batch_size = 1024
My TensorFlow graph (a simple LSTM followed by max pooling):
import tensorflow as tf
import numpy as np
from contextlib import contextmanager
import time


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


class MyGraph():
    def __init__(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_x = tf.placeholder(tf.float32, shape=(None, 10, 300))
            self.input_y = tf.placeholder(tf.float32, shape=(None, 1))
            self.gru = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1,
                                                      num_units=40,
                                                      direction='bidirectional')
            self.first_RNN, _ = self.gru(self.input_x)
            self.max_pool = tf.reduce_max(self.first_RNN, 1)
            self.logits = tf.layers.dense(self.max_pool, 1, kernel_initializer=tf.glorot_uniform_initializer())
            # Loss
            self.loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.input_y, logits=self.logits)
            self.final_loss = tf.reduce_mean(self.loss)
            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer()
            self.train_op = self.optimizer.minimize(self.final_loss, global_step=self.global_step)
Training the model on this dataset with feed_dict:
tf.reset_default_graph()
g = MyGraph(); print("Graph loaded")

with tf.Session(graph=g.graph) as sess:
    with timer('done one epoch'):
        sess.run(tf.global_variables_initializer())
        for step in range(int(total_n/batch_size)):
            batch_x = train_X[step*batch_size:(step+1)*batch_size]
            batch_y = train_y[step*batch_size:(step+1)*batch_size]
            sess.run(g.train_op, feed_dict={g.input_x: batch_x, g.input_y: batch_y})
        print('Final step index: ', step)
The output is:
Graph loaded
Final step index: 499
[done one epoch] done in 48 s
Then I ran the second experiment with Keras:
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Bidirectional, CuDNNLSTM, GlobalMaxPooling1D


def model_lstm_atten():
    K.clear_session()
    inp = Input(shape=(10, 300,))
    x = Bidirectional(CuDNNLSTM(40, return_sequences=True))(inp)
    max_pool = GlobalMaxPooling1D()(x)
    outp = Dense(1, activation="sigmoid")(max_pool)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
model = model_lstm_atten()

with timer('done one epoch'):
    model.fit(train_X, train_y, batch_size=1024, epochs=1, callbacks=None, verbose=0)
The output is:
[done one epoch] done in 15 s
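That works out to roughly 15 s / 500 ≈ 30 ms per batch for Keras, versus about 48 s / 500 ≈ 96 ms per batch for the TensorFlow graph above (both runs go through 500 batches of 1024 samples per epoch).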
To rule out a data-feeding bottleneck on the TF side, I made another attempt: basically, I embed one mini-batch of data directly into the graph as constants, so I don't have to use feed_dict during training. (This is not real training, of course; it is only meant to show the raw step speed.)
class MyGraph1():
    def __init__(self, data):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_x, self.input_y = data[0].astype(np.float32), data[1].astype(np.float32)
            self.input_x = tf.convert_to_tensor(self.input_x)
            self.input_y = tf.convert_to_tensor(self.input_y)
            self.gru = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1,
                                                      num_units=40,
                                                      direction='bidirectional')
            self.first_RNN, _ = self.gru(self.input_x)
            self.max_pool = tf.reduce_max(self.first_RNN, 1)
            self.logits = tf.layers.dense(self.max_pool, 1, kernel_initializer=tf.glorot_uniform_initializer())
            # Loss
            self.loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.input_y, logits=self.logits)
            self.final_loss = tf.reduce_mean(self.loss)
            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer()
            self.train_op = self.optimizer.minimize(self.final_loss, global_step=self.global_step)
Training with just that one mini-batch:
batch_x = train_X[:batch_size]
batch_y = train_y[:batch_size]

tf.reset_default_graph()
g1 = MyGraph1((batch_x, batch_y)); print("Graph loaded")

with tf.Session(graph=g1.graph) as sess:
    with timer('done one epoch'):
        sess.run(tf.global_variables_initializer())
        for step in range(int(total_n/batch_size)):
            sess.run(g1.train_op)
        print('Final step index: ', step)
Again, the output is:
Graph loaded
Final step index: 499
[done one epoch] done in 39 s
Still much slower than the Keras version (39 s vs. 15 s for one epoch).
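For completeness, yet another common way to take Python-side feeding out of the picture in TF 1.x would be a tf.data pipeline with an initializable iterator. The snippet below is only a minimal, untimed sketch of how the same model could be driven from iterator tensors instead of placeholders (it reuses train_X, train_y, total_n, batch_size and timer from above); I have not benchmarked this variant.

# Illustrative sketch only (not benchmarked): feed the same arrays via tf.data.
# Placeholders + an initializable iterator avoid embedding the large numpy
# arrays as constants in the graph.
graph = tf.Graph()
with graph.as_default():
    x_ph = tf.placeholder(tf.float32, shape=train_X.shape)
    y_ph = tf.placeholder(tf.float32, shape=train_y.shape)
    dataset = (tf.data.Dataset.from_tensor_slices((x_ph, y_ph))
               .batch(batch_size)
               .prefetch(1))
    iterator = dataset.make_initializable_iterator()
    next_x, next_y = iterator.get_next()
    # Same layers as in MyGraph, but built on the iterator's output tensors.
    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1, num_units=40,
                                          direction='bidirectional')
    rnn_out, _ = lstm(next_x)
    max_pool = tf.reduce_max(rnn_out, 1)
    logits = tf.layers.dense(max_pool, 1,
                             kernel_initializer=tf.glorot_uniform_initializer())
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=next_y, logits=logits))
    train_op = tf.train.AdamOptimizer().minimize(loss)
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    sess.run(iterator.initializer,
             feed_dict={x_ph: train_X, y_ph: train_y.astype(np.float32)})
    with timer('one epoch with tf.data'):
        for step in range(int(total_n / batch_size)):
            sess.run(train_op)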
My TensorFlow and Keras versions are:
1.10.0 - tf
2.2.2 - keras
So why is TF slower than Keras here?