Why does RecvTensor take so long in my distributed TensorFlow timeline?

Time: 2018-08-14 09:17:51

Tags: python tensorflow distributed-computing

Even after simplifying the network structure down to a convolutional layer, RecvTensor still takes 99% of the time. How can I tune this? My machines' graphics cards are AMD, so I cannot use TensorFlow-GPU and am using TensorFlow-CPU here.

A summary of all my code (VGG-style):

  • 7 * conv2d (32, 64, 128)
  • 3 * fc (100, 100, 2)
  • image_size: 64 * 64 * 3
  • batch_size: 16
  • data_format: TFRecords
  • 2 PS tasks, 2 workers

My code is as follows:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time
from tensorflow.python.client import timeline
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer("num_gpus", 0,
                     "Total number of gpus for each machine. If you don't use GPU, please set it to '0'")
flags.DEFINE_integer("replicas_to_aggregate", None,
                     "Number of replicas to aggregate before parameter update "
                     "is applied (For sync_replicas mode only; default: "
                     "num_workers)")
flags.DEFINE_integer("train_steps", 20, "Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 16, "Training batch size")
flags.DEFINE_float("learning_rate", 0.001, "Learning rate")
flags.DEFINE_boolean("sync_replicas", False, "Use the sync_replicas (synchronized replicas) mode")

ps_ip = '***.***.**.16:22227,***.***.**.17:22231'
ip = '***.***.**.18:22229,***.***.**.19:22225'
flags.DEFINE_string('ps_hosts', ps_ip, 'Comma-separated list of hostname:port pairs')
flags.DEFINE_string('worker_hosts', ip, 'Comma-separated list of hostname:port pairs')
flags.DEFINE_string("job_name", None, "job name: worker or ps")
flags.DEFINE_integer("task_index", None, "Worker task index, should be >= 0")

FLAGS = flags.FLAGS
batch_size = FLAGS.batch_size  

def main(unused_argv):
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    num_workers = len(worker_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        is_chief = (FLAGS.task_index == 0)

        with tf.device(tf.train.replica_device_setter(cluster=cluster, 
                                                      worker_device="/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, 0))):
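            # replica_device_setter pins the model variables to the ps tasks and leaves
            # the compute ops on this worker, so every training step fetches the current
            # variable values (and sends gradients back) over gRPC; those transfers show
            # up as RecvTensor in the timeline.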

            image_height = 64
            image_width = 64
            num_channels = 3
            num_targets = 2
            IMG_PIXELS = image_height * image_width * num_channels
            TRAIN_FILE = './train.tfrecords'
            TEST_FILE = './test.tfrecords'

            def read_and_decode(filename_queue):
                reader = tf.TFRecordReader()
                _, serialized_example = reader.read(filename_queue)
                features = tf.parse_single_example(serialized_example, features={
                    'label': tf.FixedLenFeature([], tf.int64),
                    'img_raw': tf.FixedLenFeature([], tf.string)
                })
                image = tf.decode_raw(features['img_raw'], tf.uint8)
                label = features['label']

                image.set_shape([IMG_PIXELS])
                image = tf.reshape(image, [image_height, image_width, num_channels])
                image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
                return image, label

            def inputs(data_set, batch_size, num_epochs):
                if not num_epochs:
                    num_epochs = None
                if data_set == 'train':
                    file = TRAIN_FILE
                else:
                    file = TEST_FILE

                with tf.name_scope('input') as scope:
                    filename_queue = tf.train.string_input_producer([file], num_epochs=num_epochs)
                image, label = read_and_decode(filename_queue)
                images, labels = tf.train.shuffle_batch([image, label],
                                                        batch_size=batch_size,
                                                        num_threads=64,
                                                        capacity=320 + 3 * batch_size,
                                                        min_after_dequeue=320
                                                        )
                return images, labels

            global_step = tf.Variable(0, name="global_step", trainable=False) 

            images, targets = inputs(data_set='train', batch_size=batch_size, num_epochs=None)
            test_images, test_targets = inputs(data_set='test', batch_size=batch_size, num_epochs=None)

            def cnn_model(input_images, batch_size):
                def truncated_nomal_var(name, shape, dtype):
                    return (tf.get_variable(name=name, shape=shape, dtype=dtype,
                                            initializer=tf.truncated_normal_initializer(stddev=0.05)))

                def zero_var(name, shape, dtype):
                    return (tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.constant_initializer(0.0)))

                with tf.variable_scope('conv1') as scope:
                    conv1_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 3, 32], dtype=tf.float32)
                    conv1 = tf.nn.conv2d(input_images, conv1_kernel, strides=[1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv1_bias = zero_var(name='conv_bias', shape=[32], dtype=tf.float32)
                    conv1_add_bias = tf.nn.bias_add(conv1, conv1_bias)
                    relu_conv1 = tf.nn.relu(conv1_add_bias, name='relu')
                    norm1 = tf.nn.dropout(relu_conv1, keep_prob=0.8, name='dropout')

                with tf.variable_scope('conv2') as scope:
                    conv2_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 32, 32], dtype=tf.float32)
                    conv2 = tf.nn.conv2d(norm1, conv2_kernel, strides=[1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv2_bias = zero_var(name='conv_bias', shape=[32], dtype=tf.float32)
                    conv2_add_bias = tf.nn.bias_add(conv2, conv2_bias)
                    relu_conv2 = tf.nn.relu(conv2_add_bias, name='relu')
                    pool1 = tf.nn.avg_pool(relu_conv2, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME',
                                       name='pool_layer')
                    norm2 = tf.nn.dropout(pool1, keep_prob=0.8, name='dropout')

                with tf.variable_scope('conv3') as scope:
                    conv3_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 32, 64], dtype=tf.float32)
                    conv3 = tf.nn.conv2d(norm2, conv3_kernel, [1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv3_bias = zero_var(name='conv_bias', shape=[64], dtype=tf.float32)
                    conv3_add_bias = tf.nn.bias_add(conv3, conv3_bias)
                    relu_conv3 = tf.nn.relu(conv3_add_bias, name='relu')
                    norm3 = tf.nn.dropout(relu_conv3, keep_prob=0.7, name='dropout')

                with tf.variable_scope('conv4') as scope:
                    conv4_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 64, 64], dtype=tf.float32)
                    conv4 = tf.nn.conv2d(norm3, conv4_kernel, [1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv4_bias = zero_var(name='conv_bias', shape=[64], dtype=tf.float32)
                    conv4_add_bias = tf.nn.bias_add(conv4, conv4_bias)
                    relu_conv4 = tf.nn.relu(conv4_add_bias, name='relu')
                    pool2 = tf.nn.avg_pool(relu_conv4, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME',
                                       name='pool_layer')
                    norm4 = tf.nn.dropout(pool2, keep_prob=0.7, name='dropout4')

                with tf.variable_scope('conv5') as scope:
                    conv5_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 64, 128], dtype=tf.float32)
                    conv5 = tf.nn.conv2d(norm4, conv5_kernel, [1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv5_bias = zero_var(name='conv_bias', shape=[128], dtype=tf.float32)
                    conv5_add_bias = tf.nn.bias_add(conv5, conv5_bias)
                    relu_conv5 = tf.nn.relu(conv5_add_bias, name='relu')
                    norm5 = tf.nn.dropout(relu_conv5, keep_prob=0.6, name='dropout')

                with tf.variable_scope('conv6') as scope:
                    conv6_kernel = truncated_nomal_var(name='conv_kernel', shape=[3, 3, 128, 128], dtype=tf.float32)
                    conv6 = tf.nn.conv2d(norm5, conv6_kernel, [1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv6_bias = zero_var(name='conv_bias', shape=[128], dtype=tf.float32)
                    conv6_add_bias = tf.nn.bias_add(conv6, conv6_bias)
                    relu_conv6 = tf.nn.relu(conv6_add_bias, name='relu')
                    norm6 = tf.nn.dropout(relu_conv6, keep_prob=0.6, name='dropout')

                with tf.variable_scope('conv7') as scope:
                    conv7_kernel = truncated_nomal_var(name='conv_kernel', shape=[1, 1, 128, 128], dtype=tf.float32)
                    conv7 = tf.nn.conv2d(norm6, conv7_kernel, [1, 1, 1, 1], padding='SAME', name='conv2d')
                    conv7_bias = zero_var(name='conv_bias', shape=[128], dtype=tf.float32)
                    conv7_add_bias = tf.nn.bias_add(conv7, conv7_bias)
                    relu_conv7 = tf.nn.relu(conv7_add_bias, name='relu')
                    pool7 = tf.nn.avg_pool(relu_conv7, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME',
                                       name='pool_layer')
                    norm7 = tf.nn.dropout(pool7, keep_prob=0.6, name='dropout')

                reshaped_output = tf.reshape(norm1, [batch_size, -1])
                reshaped_dim = reshaped_output.get_shape()[1].value
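                # Note: reshaped_output is taken from norm1 (the conv1 output), so the
                # conv2-conv7 blocks above do not feed the fully connected layers or the loss.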

                with tf.variable_scope('full1') as scope:
                    full_weigth1 = truncated_nomal_var(name='full_mult', shape=[reshaped_dim, 100], dtype=tf.float32)
                    full_bias1 = zero_var(name='full_bias', shape=[100], dtype=tf.float32)
                    full_layer1 = tf.nn.relu(tf.add(tf.matmul(reshaped_output, full_weigth1), full_bias1), name='relu')

                with tf.variable_scope('full2') as scope:
                    full_weight2 = truncated_nomal_var(name='full_mult', shape=[100, 32], dtype=tf.float32)
                    full_bias2 = zero_var(name='full_bias', shape=[32], dtype=tf.float32)
                    full_layer2 = tf.nn.relu(tf.add(tf.matmul(full_layer1, full_weight2), full_bias2), name='relu')

                with tf.variable_scope('full3') as scope:
                    full_weight3 = truncated_nomal_var(name='full_mult', shape=[32, num_targets], dtype=tf.float32)
                    full_bias3 = zero_var(name='full_bias', shape=[num_targets], dtype=tf.float32)
                    final_output = tf.add(tf.matmul(full_layer2, full_weight3), full_bias3)
                return final_output

            def loss(logits, targets):
                targets = tf.squeeze(tf.cast(targets, tf.int32))
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
                cross_entropy_mean = tf.reduce_mean(cross_entropy)
                return cross_entropy_mean

            def accuracy_of_batch(logits, targets):
                targets = tf.squeeze(tf.cast(targets, tf.int32))
                batch_predictions = tf.cast(tf.argmax(logits, 1), tf.int32)
                predicted_correctly = tf.equal(batch_predictions, targets)
                accuracy = tf.reduce_mean(tf.cast(predicted_correctly, tf.float32))
                return accuracy

            with tf.variable_scope('model_definition') as scope:
                model_output = cnn_model(images, batch_size)
                scope.reuse_variables()
                test_output = cnn_model(test_images, batch_size)

            prediction = tf.nn.softmax(model_output)
            test_prediction = tf.nn.softmax(test_output)
            loss = loss(model_output, targets)

            if FLAGS.sync_replicas:
                # sync
                if FLAGS.replicas_to_aggregate is None:
                    replicas_to_aggregate = num_workers
                else:
                    replicas_to_aggregate = FLAGS.replicas_to_aggregate

                my_optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
                grads_and_vars = my_optimizer.compute_gradients(loss)
                test_acc = accuracy_of_batch(test_prediction, test_targets) 
                train_acc = accuracy_of_batch(prediction, targets)

                opt = tf.train.SyncReplicasOptimizer(my_optimizer,
                                                     replicas_to_aggregate=replicas_to_aggregate,
                                                     total_num_replicas=num_workers,
                                                     name="A01_sync_replicas")
                train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
                init_token_op = opt.get_init_tokens_op()
                chief_queue_runner = opt.get_chief_queue_runner()
            else:
                # async
                my_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
                test_acc = accuracy_of_batch(test_prediction, test_targets) 
                train_acc = accuracy_of_batch(prediction, targets)  
                train_op = my_optimizer.minimize(loss, global_step=global_step)

            init_op = tf.global_variables_initializer()

            sv = tf.train.Supervisor(
                is_chief=is_chief,  
                logdir='./checkout',
                init_op=init_op,  
                recovery_wait_secs=1,
                global_step=global_step)

            if is_chief:
                print("Worker %d: Initializing session..." % FLAGS.task_index)
            else:
                print("Worker %d: Waiting for session to be initialized..." %
                      FLAGS.task_index)

            sess_config = tf.ConfigProto(
                    allow_soft_placement=True,
                    log_device_placement=False,
                    device_filters=["/job:ps","/job:worker/task:%d" % FLAGS.task_index])
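            # device_filters restricts this session to the ps tasks and this worker's own
            # devices, so it does not block waiting on the other worker.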

            with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:

                if FLAGS.task_index == 0 and FLAGS.sync_replicas == True:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_token_op)

                print("Worker %d: Session initialization complete." % FLAGS.task_index)
                print("-------------Training begins--------")

                start_time = time.time()
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord, sess=sess)

                run_metadata = tf.RunMetadata()
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

                for i in range(FLAGS.train_steps):
                    _, loss_value, temp_train_acc, temp_test_acc, step = sess.run(
                        [train_op, loss, train_acc, test_acc, global_step], options=run_options, run_metadata=run_metadata)
                    output = 'Loss = {:.5f}'.format(loss_value)
                    test_acc_output = ' --- test_acc = {:.3f}'.format(temp_test_acc)
                    train_acc_output = ' --- train_acc = {:.3f}'.format(temp_train_acc)
                    time_output = '%.2fs' % (time.time() - start_time)
                    STEP = 'train_step: %d | global_step: %d' % (i, step)
                    print(output, test_acc_output, train_acc_output, time_output, STEP)

                    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                    with open('./timeline.json', 'w') as trace_file:
                        trace_file.write(trace.generate_chrome_trace_format())
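                    # The file is opened with 'w', so timeline.json is overwritten every
                    # step and only the trace of the last training step is kept.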
                coord.request_stop()
                coord.join(threads)

if __name__ == "__main__":
    tf.app.run()

The result is as follows:
P1: timeline image

I used 4 laptops: 2 as PS tasks and 2 as workers. Each machine's configuration is:

  1. 8 GB of RAM
  2. CPU: Intel(R) Core(TM) i5-4210U @ 1.70GHz × 4

Update: 2018/8/15 16:34
I used iperf to test the network bandwidth between the PS (iperf server) and the workers (iperf clients).

Testing between one PS and one worker, the result is:
P2: 1 ps -- 1 worker image
Testing one PS with two workers at the same time:
P3: 1 ps -- 2 workers image

So, is RecvTensor taking so much time because the network bandwidth is too low? From P2 you can see it is only about 11 MB per second.
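
As a rough sanity check (only a back-of-envelope estimate, assuming each training step pulls the current variable values from the PS tasks and pushes gradients of about the same size), the data volume per step compared with the measured ~11 MB/s works out roughly as follows:

# Back-of-envelope estimate (not a measurement): bytes moved between one worker
# and the ps tasks per training step, and the time that takes at ~11 MB/s.
# Only the layers that actually reach the loss are counted (conv1 plus the three
# fully connected layers, since reshaped_output comes from norm1).
conv1_params = 3 * 3 * 3 * 32 + 32          # kernel + bias
fc1_params = (64 * 64 * 32) * 100 + 100     # reshaped_dim = 64 * 64 * 32
fc2_params = 100 * 32 + 32
fc3_params = 32 * 2 + 2
total_params = conv1_params + fc1_params + fc2_params + fc3_params

bytes_per_step = total_params * 4 * 2       # float32; pull variables, push gradients
bandwidth = 11e6                            # bytes per second, from the iperf test (P2)
print("parameters: %d" % total_params)                                  # ~13.1 million, dominated by fc1
print("network seconds per step: %.1f" % (bytes_per_step / bandwidth))  # ~9.5 s

If that estimate is roughly right, then on this link almost the entire step time would indeed be spent waiting for RecvTensor.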
