TensorFlow XLA with multiple GPUs does not use the GPUs simultaneously

Date: 2017-02-18 19:20:09

Tags: tensorflow

I am trying to use XLA on a multi-GPU machine. When I turn on the XLA JIT, TensorFlow does not use the GPUs simultaneously.

With XLA on, gpu0 and gpu1 are activated alternately:

[screenshot: GPU utilization with XLA on]

With XLA off, gpu0 and gpu1 are active simultaneously:

[screenshot: GPU utilization with XLA off]

What is going on in my environment?
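
For reference, per-device activity can be inspected more precisely than with a utilization monitor by dumping a Chrome trace for one step. A minimal sketch, assuming TF 1.x and the sess / train_op names from the code below:

from tensorflow.python.client import timeline
import tensorflow as tf

# Trace one training step with full device/kernel stats.
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(train_op, options=run_options, run_metadata=run_metadata)

# Write a Chrome trace; open chrome://tracing to see per-GPU kernel timelines.
with open('timeline.json', 'w') as f:
    f.write(timeline.Timeline(run_metadata.step_stats).generate_chrome_trace_format())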

My code:

import tensorflow as tf
from pathlib import Path
import time

INPUT_SIZE = 64
INPUT_CHANNELS = 1
MINIBATCH_SIZE = 32
NUM_ITERATIONS = 200000
NUM_GPU = 2

def read_op(filename_queue, reader):
    _, raw = reader.read(filename_queue)

    read_image = tf.image.decode_jpeg(
        raw, channels=INPUT_CHANNELS)
    read_image = tf.to_float(read_image) / 255.
    read_image = tf.image.resize_images(read_image, [INPUT_SIZE, INPUT_SIZE])
    return read_image

def inference(image, log_suffix):
    # autoencoder model for multi-GPU testing
    # this model has no particular meaning
    def w_init(initial_weight=1e-3):
        return tf.truncated_normal_initializer(stddev=initial_weight)

    def make_conv(x, out_ch, stride=[1,1,1,1]):
        shape = x.get_shape().as_list()

        with tf.device('/cpu:0'):
            conv_w = tf.get_variable(initializer=w_init(), name='weight',
                shape=[7, 7, shape[3], out_ch])

        conv = tf.nn.conv2d(x, conv_w, stride, padding='SAME')
        mean, var = tf.nn.moments(conv, [0])
        conv = tf.nn.batch_normalization(conv, mean, var, None, None, 1e-9)

        return tf.nn.relu(conv)

    def make_deconv(x, out_shape, bn=True):
        shape = x.get_shape().as_list()

        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(), name='weight',
                shape=[7, 7, out_shape[3], shape[3]])

        deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,2,2,1])
        mean, var = tf.nn.moments(deconv, [0])

        if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9)

        return tf.nn.relu(deconv)

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3):
        shape = x.get_shape().as_list()

        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(), name='weight',
                shape=[7, 7, out_shape[3], shape[3]])

        deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,1,1,1])
        mean, var = tf.nn.moments(deconv, [0])

        if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9)

        return activate(deconv)

    with tf.variable_scope('conv1'):
        conv1 = make_conv(image, 128)
    with tf.variable_scope('conv2'):
        conv2 = make_conv(conv1, 128)
    with tf.variable_scope('conv3'):
        conv3 = make_conv(conv2, 160, stride=[1,2,2,1])
    with tf.variable_scope('conv4'):
        conv4 = make_conv(conv3, 160)
    with tf.variable_scope('conv5'):
        conv5 = make_conv(conv4, 192, stride=[1,2,2,1])
    with tf.variable_scope('conv6'):
        conv6 = make_conv(conv5, 192)
    with tf.variable_scope('conv7'):
        conv7 = make_conv(conv6, 256, stride=[1,2,2,1])
    with tf.variable_scope('conv8'):
        conv8 = make_conv(conv7, 256)
    with tf.variable_scope('linear1'):
        feature_length = 300
        shape = conv8.get_shape().as_list()
        vec_length = shape[1] * shape[2] * shape[3]
        in_vec = tf.reshape(conv8,[-1, vec_length])

        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                shape=[vec_length, feature_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                shape=[feature_length])

        linear1 = tf.matmul(in_vec, w) + b
        mean, var = tf.nn.moments(linear1, [0])
        linear1 = tf.nn.batch_normalization(linear1, mean, var, None, None, 1e-9)
        linear1 = tf.nn.sigmoid(linear1)
    with tf.variable_scope('linear2'):
        in_shape = linear1.get_shape().as_list()
        in_length = in_shape[1]
        out_shape = conv8.get_shape().as_list()
        out_length = out_shape[1] * out_shape[2] * out_shape[3]

        with tf.device('/cpu:0'):
            w = tf.get_variable(initializer=w_init(1e-2), name='weight',
                shape=[in_length, out_length])
            b = tf.get_variable(initializer=w_init(1e-2), name='bias',
                shape=[out_length])

        linear2 = tf.matmul(linear1, w) + b
        mean, var = tf.nn.moments(linear2, [0])
        linear2 = tf.nn.batch_normalization(linear2, mean, var, None, None, 1e-9)
        linear2 = tf.nn.sigmoid(linear2)
        linear2 = tf.reshape(linear2, out_shape)
    with tf.variable_scope('deconv1'):
        deconv1 = make_deconv_same(linear2, conv7.get_shape())
    with tf.variable_scope('deconv2'):
        deconv2 = make_deconv(deconv1, conv6.get_shape())
    with tf.variable_scope('deconv3'):
        deconv3 = make_deconv_same(deconv2, conv5.get_shape())
    with tf.variable_scope('deconv4'):
        deconv4 = make_deconv(deconv3, conv4.get_shape())
    with tf.variable_scope('deconv5'):
        deconv5 = make_deconv_same(deconv4, conv3.get_shape())
    with tf.variable_scope('deconv6'):
        deconv6 = make_deconv(deconv5, conv2.get_shape())
    with tf.variable_scope('deconv7'):
        deconv7 = make_deconv_same(deconv6, conv1.get_shape())
    with tf.variable_scope('deconv8'):
        deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1)

    with tf.device('/cpu:0'):
        image_log = tf.summary.image('output'+log_suffix, deconv8, collections=['image_log'])
        image_log = tf.summary.image('input'+log_suffix, image, collections=['image_log'])

    return deconv8

def loss(label, out, global_step, log_suffix):
    with tf.name_scope('loss'):
        l = tf.squared_difference(label, out)

        # scale up for TensorBoard's logarithmic graph mode
        lv = tf.reduce_mean(l) * 1e+7

        with tf.device('/cpu:0'):
            loss_log = tf.summary.scalar('loss'+log_suffix,lv)

    return l

def average_gradients(tower_grads):
    with tf.name_scope('average_gradients'):
        average_grads = []

        for grad_and_vars in zip(*tower_grads):
            grads = []

            for g, _ in grad_and_vars:
                expanded_g = tf.expand_dims(g, 0)
                grads.append(expanded_g)

            grad = tf.concat(grads, axis=0)
            grad = tf.reduce_mean(grad, 0)

            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)

        for grad,var in average_grads:
            with tf.device('/cpu:0'):
                tf.summary.histogram('grads/'+var.name, grad, collections=['grads'])

    return average_grads

def main():
    global NUM_GPU, MINIBATCH_SIZE

    # many jpeg images
    sample_dir = Path('./training_samples')
    file_list = [p for p in sample_dir.iterdir() if p.suffix == '.jpg']
    file_list = list(map(str, file_list))

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        config_proto = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # if XLA is on, the problem occurs
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config_proto)

        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        with tf.variable_scope('optimizer'):
            opt = tf.train.AdamOptimizer(1e-6)

        with tf.variable_scope('input'):
            filename_queue = tf.train.string_input_producer(file_list)
            reader = tf.WholeFileReader()
            images_list = [
                tf.train.shuffle_batch(
                    [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8),
                tf.train.shuffle_batch(
                    [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)]

        tower_grads = []
        reuse = False
        for i in range(NUM_GPU):
            with tf.device('/gpu:{}'.format(i)):
                with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)):
                    infer = inference(images_list[i], '/tower_{}'.format(i))
                    reuse = True
                    tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))

                grads = opt.compute_gradients(tower_loss)
                tower_grads.append( grads )

        grads = average_gradients(tower_grads)
        train_op = opt.apply_gradients(grads, global_step=global_step)

        image_log_op = tf.summary.merge(tf.get_collection('image_log'))
        loss_log_op = tf.summary.merge_all()
        grads_log_op = tf.summary.merge(tf.get_collection('grads'))

        writer = tf.summary.FileWriter('logs')
        sess.run(tf.global_variables_initializer())
        writer.add_graph(tf.get_default_graph())
        coordinator = tf.train.Coordinator()

        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)

        for i in range(NUM_ITERATIONS):
            print('iteration: ',i)

            start = time.time()

            if i % 2 == 0:
                _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op])
                writer.add_summary(loss_log, i)
                writer.add_summary(image_log, i)
                writer.flush()
            else:
                _ = sess.run([train_op])

            end = time.time()

            print('time = {}'.format(end - start))

        writer.close()

if __name__ == '__main__':
    main()

Environment info

Operating system: Ubuntu 16.04
GPU: GTX 1080 x2
Build options (gcc): -march=native -O3
Configure option (CUDA compute capability): 6.1

Installed CUDA and cuDNN versions:

/usr/local/cuda/lib64/libcudadevrt.a
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart.so.8.0.44
/usr/local/cuda/lib64/libcudart_static.a
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn.so.5.1.5
/usr/local/cuda/lib64/libcudnn_static.a

TensorFlow commit hash: c56c873fbaf976d26d487ad57c8efbc87f05331c

Output of bazel version:
.......
Build label: 0.4.4
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
Build time: Wed Feb 1 18:54:21 2017 (1485975261)
Build timestamp: 1485975261
Build timestamp as int: 1485975261

2 Answers:

Answer 0 (score: 3):

Currently, XLA is single-GPU.

Answer 1 (score: 0):

Don't turn on JIT compilation at the session level. Instead, use jit_scope() to control which parts of the model are included in XLA JIT compilation; this approach works with multi-GPU training. For more details, see: http://danny270degree.blogspot.com/2018/06/xla-jit-how-to-turn-on-xla-jit.html
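
A minimal sketch of that approach, assuming the TF 1.x contrib API (tensorflow.contrib.compiler.jit) and reusing the tower loop from the question's code; each device then gets its own XLA cluster instead of one session-wide cluster:

import tensorflow as tf
from tensorflow.contrib.compiler import jit  # TF 1.x contrib API

tower_grads = []
for i in range(NUM_GPU):
    with tf.device('/gpu:{}'.format(i)):
        # Compile only this tower's ops, rather than turning on
        # global_jit_level in the session's ConfigProto.
        with jit.experimental_jit_scope():
            with tf.variable_scope('model', reuse=(i > 0)):
                infer = inference(images_list[i], '/tower_{}'.format(i))
                tower_loss = loss(images_list[i], infer, global_step,
                                  '/tower_{}'.format(i))
        grads = opt.compute_gradients(tower_loss)
        tower_grads.append(grads)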