I have a small dataset that fits comfortably in GPU RAM. My goal is to make better use of my GPU (currently at around 70% utilization) and thereby reduce training time, using the new Dataset API in TensorFlow v1.4.
I would like to increase GPU utilization without adding more layers or increasing the batch size. How can I achieve this with the Dataset API?
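For example, I am not sure whether a change along these lines would even help. It is only a rough sketch (it reuses the variable names from the code below and assumes Dataset.cache() is available in this TensorFlow version):

# Rough sketch of the kind of pipeline change I have in mind (not benchmarked).
# Since the whole training set fits in memory, cache it and shuffle over the full set.
train_set = (tf.data.Dataset.from_tensor_slices((train_set_x, train_set_y))
             .cache()                              # keep the slices in memory after the first pass
             .shuffle(buffer_size=TRAIN_SET_SIZE)  # shuffle over the whole set instead of 1000 samples
             .batch(TRAIN_BATCH_SIZE)
             .prefetch(1))                         # prepare the next batch while the current one trains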
Below is a simplified example of my current implementation:
import numpy as np
from time import time
import tensorflow as tf
"""
Simple regression example with Dataset API.
The training and validation sets are small enough to fit in GPU RAM.
"""
TRAIN_SET_SIZE = 130000
VAL_SET_SIZE = 30000
TRAIN_BATCH_SIZE = 100
VAL_BATCH_SIZE = 1000
TRAIN_PREFETCH = 200
VAL_PREFETCH = 1
INPUT_FEATURES = 120
LAYERS = [500, 500, 500, 500, 1] # last layer size should be 1
def fc_layer(in_tensor, in_dim, out_dim, name, act_fun=tf.nn.relu):
    with tf.variable_scope(name):
        sd = 1.0 / np.sqrt(in_dim)
        W_fc = tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=sd), name='weights')
        b_fc = tf.Variable(tf.truncated_normal([out_dim], stddev=sd), name='bias')
        z_fc = tf.matmul(in_tensor, W_fc) + b_fc
        if act_fun is None:
            return z_fc
        else:
            return act_fun(z_fc)
# Create dummy data
train_set_x = np.random.uniform(low=-1, high=1, size=(TRAIN_SET_SIZE, INPUT_FEATURES)).astype(np.float32)
train_set_y = np.random.uniform(low=-1, high=2, size=(TRAIN_SET_SIZE)).astype(np.float32)
val_set_x = np.random.uniform(low=-1, high=1, size=(VAL_SET_SIZE, INPUT_FEATURES)).astype(np.float32)
val_set_y = np.random.uniform(low=-1, high=2, size=(VAL_SET_SIZE)).astype(np.float32)
# Reset graph
tf.reset_default_graph()
with tf.device('/gpu:0'):
    # Dummy train data
    train_set = tf.data.Dataset.from_tensor_slices((train_set_x, train_set_y))
    # TODO: batch first and then prefetch, or the other way around?
    # TODO: TRAIN_PREFETCH value?
    train_set = train_set.shuffle(buffer_size=1000).batch(TRAIN_BATCH_SIZE).prefetch(TRAIN_PREFETCH)
    # Dummy val data
    val_set = tf.data.Dataset.from_tensor_slices((val_set_x, val_set_y))
    # TODO: VAL_PREFETCH value?
    val_set = val_set.batch(VAL_BATCH_SIZE).prefetch(VAL_PREFETCH)
    # Iterator
    iterator = tf.data.Iterator.from_structure(train_set.output_types, train_set.output_shapes)
    train_init_op = iterator.make_initializer(train_set)
    val_init_op = iterator.make_initializer(val_set)
    x, truth = iterator.get_next()
    # Build graph
    activations = []
    activations.append(fc_layer(x,
                                INPUT_FEATURES,
                                LAYERS[0],
                                name='fc0'))
    for layer_ix in range(1, len(LAYERS) - 1):
        activations.append(fc_layer(activations[-1],
                                    LAYERS[layer_ix - 1],
                                    LAYERS[layer_ix],
                                    name='fc' + str(layer_ix)))
    activations.append(fc_layer(activations[-1],
                                LAYERS[-2],
                                LAYERS[-1],
                                act_fun=None,
                                name='fc' + str(len(LAYERS) - 1)))
    prediction = activations[-1]
    # Squeeze prediction from (batch, 1) to (batch,) so it matches truth and avoids accidental broadcasting
    loss = tf.reduce_mean(tf.square(truth - tf.squeeze(prediction, axis=1)))
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_step = optimizer.minimize(loss, global_step=global_step, name='train_step')
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                                        log_device_placement=True,
                                        allow_soft_placement=True))
sess.run(tf.global_variables_initializer())
for e in range(1, 6):  # epochs
    epoch_start_time = time()
    # Train set
    sess.run(train_init_op)
    print('\nTrain init op time: %.4f' % (time() - epoch_start_time))
    while True:
        try:
            batch_start_time = time()
            batch_loss, step, _ = sess.run([loss, global_step, train_step])
            # if step % 1000 == 0:
            #     print('Step: %5d Loss: %.2f, Batch Time : %.5f sec' % (step, batch_loss, time() - batch_start_time))
        except tf.errors.OutOfRangeError:
            break
    # print('Epoch time (without computing val set loss): %.2f' % (time() - epoch_start_time))
    # Val set
    sess.run(val_init_op)
    pred_err = np.ndarray([VAL_SET_SIZE])
    ix = 0
    while True:
        try:
            p, t = sess.run([prediction, truth])
            pred_err[ix:ix + VAL_BATCH_SIZE] = p.reshape([-1]) - t
            ix += VAL_BATCH_SIZE
        except tf.errors.OutOfRangeError:
            val_loss = np.mean(pred_err ** 2)
            print('Epoch: %2d, Loss: %.2f, Epoch time: %.2f sec' % (e, val_loss, time() - epoch_start_time))
            break
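Another variant I have considered, sketched below but not benchmarked: build the training pipeline with repeat() so the iterator only has to be initialized once, and drive the epochs by step count instead of catching OutOfRangeError. The names n_epochs and steps_per_epoch are only for illustration; the constants and the iterator come from the code above.

# Rough sketch: one initialization for all epochs via repeat().
n_epochs = 5
steps_per_epoch = TRAIN_SET_SIZE // TRAIN_BATCH_SIZE
train_set_repeated = (tf.data.Dataset.from_tensor_slices((train_set_x, train_set_y))
                      .shuffle(buffer_size=TRAIN_SET_SIZE)   # data is small, shuffle over the full set
                      .repeat(n_epochs)                      # stream all epochs without re-initializing
                      .batch(TRAIN_BATCH_SIZE)
                      .prefetch(TRAIN_PREFETCH))
# With the reinitializable iterator above, this would replace the per-epoch train_init_op:
# sess.run(iterator.make_initializer(train_set_repeated))
# for step in range(n_epochs * steps_per_epoch):
#     sess.run(train_step)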