I'm trying to benchmark some TF2 keras code, specifically comparing JIT-compiled performance against non-JIT. tf.test.Benchmark gives reasonable-looking results without JIT: memory usage is roughly consistent with the nvidia-smi output, and the times are very close to those of model.fit. The JITed version, however, reports a tiny memory usage (< 1 Mb, compared with 2.2 Gb without JIT), and its times are consistently about 30% shorter than those seen during model.fit.
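A cross-check against nvidia-smi can be scripted along these lines (a rough sketch only; gpu_memory_used_mb is an illustrative helper and not part of the code further down):

import subprocess

def gpu_memory_used_mb(device_index=0):
    # ask the driver directly, as a sanity check on the benchmark's allocator stats
    out = subprocess.check_output([
        'nvidia-smi', '-i', str(device_index),
        '--query-gpu=memory.used',
        '--format=csv,noheader,nounits'])
    return int(out.decode().strip())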
Code is provided below. I have 3 main questions:

1. Why does the JIT-compiled benchmark report such a tiny memory usage, and how can I get an accurate memory figure for the JITed graph?
2. What is the source of the speed difference between model.fit and my benchmarked op?
3. My benchmark falls back to graph mode and tf.compat.v1.data.make_one_shot_iterator, but surely there's a way of doing this with @tf.function? Are there non-TF tools that can do this better?

from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
ALLOW_GROWTH = False # switch to this to use nvidia-smi
JIT = True
TFDS_NAME = 'mnist'
SHAPE = (28, 28, 1)
BATCH_SIZE = 64
NUM_CLASSES = 10
NUM_LAYERS = 20
UNITS = 4096
TRY_GCS = False # switch this if running on colab
TRAIN_STEPS = 200
BURN_ITERS = 200
MIN_ITERS = 200

def model_fn(inp):
    layers = tf.keras.layers
    x = layers.Flatten()(inp)
    for _ in range(NUM_LAYERS):
        x = layers.Dense(UNITS)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
    logits = layers.Dense(NUM_CLASSES)(x)
    model = tf.keras.Model(inp, logits)
    model.compile(
        optimizer=tf.keras.optimizers.SGD(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    return model

def get_dataset():
    return tfds.load(
        TFDS_NAME,
        split='train',
        as_supervised=True,
        in_memory=True,
        try_gcs=TRY_GCS).repeat().shuffle(1024).map(
            lambda image, label: (tf.cast(image, tf.float32) / 255, label),
            tf.data.experimental.AUTOTUNE).batch(BATCH_SIZE).prefetch(
                tf.data.experimental.AUTOTUNE)

def fit(epochs=2, steps_per_epoch=TRAIN_STEPS):
    dataset = get_dataset()
    model = model_fn(tf.keras.Input(shape=SHAPE, dtype=tf.float32))
    model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=epochs)

def benchmark(burn_iters=BURN_ITERS, min_iters=MIN_ITERS):
    with tf.Graph().as_default():
        dataset = get_dataset()
        # graph-mode hack: pull one batch per session.run from the tf.data pipeline
        image, labels = tf.compat.v1.data.make_one_shot_iterator(
            dataset).get_next()
        model = model_fn(tf.keras.Input(tensor=image))
        logits, = model.outputs
        optimizer = model.optimizer
        weights = model.weights
        loss = model.loss(labels, logits)
        grads = optimizer.get_gradients(loss, weights)
        grads_and_vars = tuple(
            (g, v) for g, v in zip(grads, weights) if g is not None)
        op = optimizer.apply_gradients(grads_and_vars)
        # run batch-norm moving-average updates alongside the weight update
        op = tf.group((op,) + tuple(model.updates))  # <---

        bm = tf.test.Benchmark()
        with tf.compat.v1.Session() as sess:
            logging.info('Initializing variables...')
            variables = model.weights + optimizer.weights
            for name in ('learning_rate', 'momentum'):
                a = getattr(optimizer, name, None)
                if isinstance(a, tf.Variable):
                    variables.append(a)
            sess.run([v.initializer for v in variables])

            logging.info('Starting benchmarking...')
            result = bm.run_op_benchmark(sess,
                                         op,
                                         burn_iters=burn_iters,
                                         min_iters=min_iters)
            logging.info('Wall time (ms): {}'.format(result['wall_time'] *
                                                     1000))
            gpu_mem = result['extras'].get(
                'allocator_maximum_num_bytes_GPU_0_bfc', 0)
            logging.info('Memory (Mb): {}'.format(gpu_mem / 1024**2))

logging.set_verbosity(logging.INFO)
tf.config.optimizer.set_jit(JIT)
for device in tf.config.experimental.get_visible_devices('GPU'):
    tf.config.experimental.set_memory_growth(device, ALLOW_GROWTH)

benchmark()
fit()
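For question 3, this is roughly the eager-style measurement I have in mind. It is only a sketch: it times a hand-written @tf.function train step with Python's time module instead of tf.test.Benchmark, reusing model_fn and get_dataset from above (benchmark_tf_function and its timing loop are my own naming, not an existing TF utility):

import time

def benchmark_tf_function(burn_iters=BURN_ITERS, min_iters=MIN_ITERS):
    dataset = get_dataset()
    model = model_fn(tf.keras.Input(shape=SHAPE, dtype=tf.float32))
    optimizer = model.optimizer
    loss_fn = model.loss

    @tf.function
    def train_step(image, labels):
        with tf.GradientTape() as tape:
            logits = model(image, training=True)
            loss = loss_fn(labels, logits)
        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        return loss

    it = iter(dataset)
    # burn-in: triggers tracing/compilation and warms up the GPU
    for _ in range(burn_iters):
        train_step(*next(it))
    start = time.time()
    for _ in range(min_iters):
        # .numpy() forces the asynchronous GPU work to finish before the clock stops
        train_step(*next(it)).numpy()
    print('mean step time (ms): {}'.format(
        (time.time() - start) / min_iters * 1000))

I'm not sure whether this actually picks up tf.config.optimizer.set_jit, though, and it obviously doesn't report allocator statistics the way run_op_benchmark does.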
Answer 0 (score: 0)
Regarding question 2: it seems keras models construct optimized graphs, but they don't appear to take advantage of JIT compilation unless the model itself is built in graph mode. By constructing the model in graph mode (i.e. inside a with tf.Graph().as_default() block, as below) I managed to get roughly equivalent times between the benchmark and fit:
def fit(epochs=2, steps_per_epoch=TRAIN_STEPS):
    # identical to the original fit() above except for the graph context
    with tf.Graph().as_default():
        dataset = get_dataset()
        model = model_fn(tf.keras.Input(shape=SHAPE, dtype=tf.float32))
        model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=epochs)
That said, this resulted in reduced performance for other models, though I haven't been able to reduce those models to a minimal example that demonstrates the behaviour.

Parts 1 and 3 of the original post remain open...