I am trying to use XLA on a multi-GPU machine. When I turn on the XLA JIT, TensorFlow no longer uses the GPUs concurrently.
With XLA on, gpu0 and gpu1 are active alternately.
With XLA off, gpu0 and gpu1 are active at the same time.
What is happening in my environment?
import tensorflow as tf
from pathlib import Path
import time

INPUT_SIZE = 64
INPUT_CHANNELS = 1
MINIBATCH_SIZE = 32
NUM_ITERATIONS = 200000
NUM_GPU = 2


def read_op(filename_queue, reader):
    _, raw = reader.read(filename_queue)
    read_image = tf.image.decode_jpeg(
        raw, channels=INPUT_CHANNELS)
    read_image = tf.to_float(read_image) / 255.
    read_image = tf.image.resize_images(read_image, [INPUT_SIZE, INPUT_SIZE])
    return read_image
def inference(image, log_suffix):
    # autoencoder model for multi-GPU testing
    # this model has no particular meaning
    def w_init(initial_weight=1e-3):
        return tf.truncated_normal_initializer(stddev=initial_weight)

    def make_conv(x, out_ch, stride=[1, 1, 1, 1]):
        shape = x.get_shape().as_list()
        with tf.device('/cpu:0'):
            conv_w = tf.get_variable(initializer=w_init(), name='weight',
                                     shape=[7, 7, shape[3], out_ch])
        conv = tf.nn.conv2d(x, conv_w, stride, padding='SAME')
        mean, var = tf.nn.moments(conv, [0])
        conv = tf.nn.batch_normalization(conv, mean, var, None, None, 1e-9)
        return tf.nn.relu(conv)

    def make_deconv(x, out_shape, bn=True):
        shape = x.get_shape().as_list()
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(), name='weight',
                                shape=[7, 7, out_shape[3], shape[3]])
        deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1, 2, 2, 1])
        mean, var = tf.nn.moments(deconv, [0])
        if bn:
            deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9)
        return tf.nn.relu(deconv)

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3):
        shape = x.get_shape().as_list()
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(), name='weight',
                                shape=[7, 7, out_shape[3], shape[3]])
        deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1, 1, 1, 1])
        mean, var = tf.nn.moments(deconv, [0])
        if bn:
            deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9)
        return activate(deconv)
    with tf.variable_scope('conv1'):
        conv1 = make_conv(image, 128)
    with tf.variable_scope('conv2'):
        conv2 = make_conv(conv1, 128)
    with tf.variable_scope('conv3'):
        conv3 = make_conv(conv2, 160, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv4'):
        conv4 = make_conv(conv3, 160)
    with tf.variable_scope('conv5'):
        conv5 = make_conv(conv4, 192, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv6'):
        conv6 = make_conv(conv5, 192)
    with tf.variable_scope('conv7'):
        conv7 = make_conv(conv6, 256, stride=[1, 2, 2, 1])
    with tf.variable_scope('conv8'):
        conv8 = make_conv(conv7, 256)
    with tf.variable_scope('linear1'):
        feature_length = 300
        shape = conv8.get_shape().as_list()
        vec_length = shape[1] * shape[2] * shape[3]
        in_vec = tf.reshape(conv8, [-1, vec_length])
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                                shape=[vec_length, feature_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                                shape=[feature_length])
        linear1 = tf.matmul(in_vec, w) + b
        mean, var = tf.nn.moments(linear1, [0])
        linear1 = tf.nn.batch_normalization(linear1, mean, var, None, None, 1e-9)
        linear1 = tf.nn.sigmoid(linear1)
    with tf.variable_scope('linear2'):
        in_shape = linear1.get_shape().as_list()
        in_length = in_shape[1]
        out_shape = conv8.get_shape().as_list()
        out_length = out_shape[1] * out_shape[2] * out_shape[3]
        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                                shape=[in_length, out_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                                shape=[out_length])
        linear2 = tf.matmul(linear1, w) + b
        mean, var = tf.nn.moments(linear2, [0])
        linear2 = tf.nn.batch_normalization(linear2, mean, var, None, None, 1e-9)
        linear2 = tf.nn.sigmoid(linear2)
        linear2 = tf.reshape(linear2, out_shape)
    with tf.variable_scope('deconv1'):
        deconv1 = make_deconv_same(linear2, conv7.get_shape())
    with tf.variable_scope('deconv2'):
        deconv2 = make_deconv(deconv1, conv6.get_shape())
    with tf.variable_scope('deconv3'):
        deconv3 = make_deconv_same(deconv2, conv5.get_shape())
    with tf.variable_scope('deconv4'):
        deconv4 = make_deconv(deconv3, conv4.get_shape())
    with tf.variable_scope('deconv5'):
        deconv5 = make_deconv_same(deconv4, conv3.get_shape())
    with tf.variable_scope('deconv6'):
        deconv6 = make_deconv(deconv5, conv2.get_shape())
    with tf.variable_scope('deconv7'):
        deconv7 = make_deconv_same(deconv6, conv1.get_shape())
    with tf.variable_scope('deconv8'):
        deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1)
    with tf.device('/cpu:0'):
        image_log = tf.summary.image('output' + log_suffix, deconv8, collections=['image_log'])
        image_log = tf.summary.image('input' + log_suffix, image, collections=['image_log'])
    return deconv8
def loss(label, out, global_step, log_suffix):
    with tf.name_scope('loss'):
        l = tf.squared_difference(label, out)
        # scaled up so the value is visible on TensorBoard's logarithmic axis
        lv = tf.reduce_mean(l) * 1e+7
        with tf.device('/cpu:0'):
            loss_log = tf.summary.scalar('loss' + log_suffix, lv)
        return l
def average_gradients(tower_grads):
    with tf.name_scope('average_gradients'):
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = []
            for g, _ in grad_and_vars:
                expanded_g = tf.expand_dims(g, 0)
                grads.append(expanded_g)
            grad = tf.concat(grads, axis=0)
            grad = tf.reduce_mean(grad, 0)
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)
        for grad, var in average_grads:
            with tf.device('/cpu:0'):
                tf.summary.histogram('grads/' + var.name, grad, collections=['grads'])
        return average_grads
def main():
    global NUM_GPU, MINIBATCH_SIZE
    # directory containing many jpeg images
    sample_dir = Path('./training_samples')
    file_list = [p for p in sample_dir.iterdir() if p.suffix == '.jpg']
    file_list = list(map(str, file_list))
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        config_proto = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # the problem occurs when XLA is turned on here
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config_proto)
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        with tf.variable_scope('optimizer'):
            opt = tf.train.AdamOptimizer(1e-6)
        with tf.variable_scope('input'):
            filename_queue = tf.train.string_input_producer(file_list)
            reader = tf.WholeFileReader()
            images_list = [
                tf.train.shuffle_batch(
                    [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8),
                tf.train.shuffle_batch(
                    [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)]
        tower_grads = []
        reuse = False
        for i in range(NUM_GPU):
            with tf.device('/gpu:{}'.format(i)):
                with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)):
                    infer = inference(images_list[i], '/tower_{}'.format(i))
                reuse = True
                tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))
                grads = opt.compute_gradients(tower_loss)
                tower_grads.append(grads)
        grads = average_gradients(tower_grads)
        train_op = opt.apply_gradients(grads, global_step=global_step)
        image_log_op = tf.summary.merge(tf.get_collection('image_log'))
        loss_log_op = tf.summary.merge_all()
        grads_log_op = tf.summary.merge(tf.get_collection('grads'))
        writer = tf.summary.FileWriter('logs')
        sess.run(tf.global_variables_initializer())
        writer.add_graph(tf.get_default_graph())
        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
        for i in range(NUM_ITERATIONS):
            print('iteration: ', i)
            start = time.time()
            if i % 2 == 0:
                _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op])
                writer.add_summary(loss_log, i)
                writer.add_summary(image_log, i)
                writer.flush()
            else:
                _ = sess.run([train_op])
            end = time.time()
            print('time = {}'.format(end - start))
        writer.close()


if __name__ == '__main__':
    main()
OS: Ubuntu 16.04. GPU: GTX 1080 x2. Configure option (gcc): -march=native -O3. Configure option (CUDA compute capability): 6.1
Installed CUDA and cuDNN versions:
/usr/local/cuda/lib64/libcudadevrt.a
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart_static.a
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn_static.a
tensorflow commit hash c56c873fbaf976d26d487ad57c8efbc87f05331c
bazel version
.......
Build label: 0.4.4
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
Build time: Wed Feb 1 18:54:21 2017 (1485975261)
Build timestamp: 1485975261
Build timestamp as int: 1485975261
Answer 0 (score: 3)
At the moment, XLA is single-GPU only.
Answer 1 (score: 0)
Don't turn on JIT compilation at the session level. Instead, use jit_scope() to mark which parts of the model should be JIT-compiled by XLA; that approach works with multi-GPU training. For more details, see: http://danny270degree.blogspot.com/2018/06/xla-jit-how-to-turn-on-xla-jit.html
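A minimal sketch of what that might look like in the question's script (my own illustration, not the answerer's code, and assuming a TF 1.x build where tf.contrib.compiler.jit.experimental_jit_scope is available): the tower loop scopes XLA per GPU instead of setting global_jit_level on the session.

    from tensorflow.contrib.compiler import jit

    # Replaces the original tower loop in main(); NUM_GPU, images_list,
    # inference, loss, opt, global_step, reuse and tower_grads are the
    # names already defined in the question's script.
    for i in range(NUM_GPU):
        with tf.device('/gpu:{}'.format(i)):
            with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)):
                # JIT-compile only this tower's ops with XLA
                with jit.experimental_jit_scope():
                    infer = inference(images_list[i], '/tower_{}'.format(i))
            reuse = True
            tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))
            grads = opt.compute_gradients(tower_loss)
            tower_grads.append(grads)

The session-level global_jit_level line stays commented out, so the input queues, summaries, and the gradient averaging on /cpu:0 are left out of XLA clustering.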