即使将网络结构简化为只有卷积层，RecvTensor 仍然占用 99% 的时间。我该如何调整？我的电脑显卡是 AMD 的，无法使用 TensorFlow-GPU，所以这里使用的是 TensorFlow-CPU。
我所有的代码摘要是(VGG):
我的代码如下:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from tensorflow.python.client import timeline
import tensorflow as tf
flags = tf.app.flags

# Default cluster layout: comma-separated host:port lists for the parameter
# servers and for the workers (addresses are masked in the source).
ps_ip = '***.***.**.16:22227,***.***.**.17:22231'
ip = '***.***.**.18:22229,***.***.**.19:22225'


def _register_flags():
    """Declare every command-line flag of this script, in declaration order.

    Each row is (definer, flag name, default value, help text); the rows are
    passed positionally to the matching ``flags.DEFINE_*`` function.
    """
    flag_table = (
        (flags.DEFINE_integer, "num_gpus", 0,
         "Total number of gpus for each machine. If you don't use GPU, please set it to '0'"),
        (flags.DEFINE_integer, "replicas_to_aggregate", None,
         "Number of replicas to aggregate before parameter update "
         "is applied (For sync_replicas mode only; default: "
         "num_workers)"),
        (flags.DEFINE_integer, "train_steps", 20,
         "Number of (global) training steps to perform"),
        (flags.DEFINE_integer, "batch_size", 16, "Training batch size"),
        (flags.DEFINE_float, "learning_rate", 0.001, "Learning rate"),
        (flags.DEFINE_boolean, "sync_replicas", False,
         "Use the sync_replicas (synchronized replicas) mode"),
        (flags.DEFINE_string, 'ps_hosts', ps_ip,
         'Comma-separated list of hostname:port pairs'),
        (flags.DEFINE_string, 'worker_hosts', ip,
         'Comma-separated list of hostname:port pairs'),
        (flags.DEFINE_string, "job_name", None, "job name: worker or ps"),
        (flags.DEFINE_integer, "task_index", None,
         "Worker task index, should be >= 0"),
    )
    for define, flag_name, default, help_text in flag_table:
        define(flag_name, default, help_text)


_register_flags()

FLAGS = flags.FLAGS
# Batch size is read from the flags once, when the module is loaded.
batch_size = FLAGS.batch_size
def main(unused_argv):
    """Run one task (parameter server or worker) of the distributed CNN job.

    The role is selected with ``--job_name`` and ``--task_index``. A ``ps``
    task only joins the cluster server. A ``worker`` task builds the input
    pipelines and the CNN graph, trains for ``--train_steps`` steps while
    printing loss/accuracy, and finally writes a Chrome trace of the last
    step to ``./timeline.json``.

    Args:
        unused_argv: Positional command-line arguments; ignored.

    Raises:
        ValueError: If ``--job_name`` or ``--task_index`` was not supplied.
    """
    if not FLAGS.job_name:
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None:
        raise ValueError("Must specify an explicit `task_index`")
    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")
    num_workers = len(worker_spec)
    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
        return
    if FLAGS.job_name != "worker":
        # As in the original, any other job name creates the server and exits.
        return

    is_chief = (FLAGS.task_index == 0)
    batch_size = FLAGS.batch_size

    image_height = 64
    image_width = 64
    num_channels = 3
    num_targets = 2
    IMG_PIXELS = image_height * image_width * num_channels
    TRAIN_FILE = './train.tfrecords'
    TEST_FILE = './test.tfrecords'

    def read_and_decode(filename_queue):
        """Read one TFRecord example and return (float image, int64 label).

        The raw uint8 bytes are reshaped to height x width x channels and
        rescaled with ``x / 255 - 0.5``.
        """
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        features = tf.parse_single_example(serialized_example, features={
            'label': tf.FixedLenFeature([], tf.int64),
            'img_raw': tf.FixedLenFeature([], tf.string)
        })
        image = tf.decode_raw(features['img_raw'], tf.uint8)
        label = features['label']
        image.set_shape([IMG_PIXELS])
        image = tf.reshape(image, [image_height, image_width, num_channels])
        image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
        return image, label

    def inputs(data_set, batch_size, num_epochs):
        """Return shuffled (images, labels) batches for 'train' or the test set.

        Any ``data_set`` other than 'train' selects the test file. A falsy
        ``num_epochs`` is passed on as None.
        """
        num_epochs = num_epochs or None
        record_file = TRAIN_FILE if data_set == 'train' else TEST_FILE
        with tf.name_scope('input'):
            filename_queue = tf.train.string_input_producer([record_file], num_epochs=num_epochs)
            image, label = read_and_decode(filename_queue)
            images, labels = tf.train.shuffle_batch(
                [image, label],
                batch_size=batch_size,
                num_threads=64,
                capacity=320 + 3 * batch_size,
                min_after_dequeue=320,
            )
        return images, labels

    def cnn_model(input_images, batch_size):
        """Build the CNN graph for ``input_images`` and return the class logits."""

        def truncated_normal_var(name, shape, dtype):
            return tf.get_variable(name=name, shape=shape, dtype=dtype,
                                   initializer=tf.truncated_normal_initializer(stddev=0.05))

        def zero_var(name, shape, dtype):
            return tf.get_variable(name=name, shape=shape, dtype=dtype,
                                   initializer=tf.constant_initializer(0.0))

        def conv_block(scope_name, layer_input, kernel_shape, keep_prob, pool=False):
            """conv2d -> bias -> relu -> (optional avg-pool) -> dropout."""
            with tf.variable_scope(scope_name):
                kernel = truncated_normal_var(name='conv_kernel', shape=kernel_shape, dtype=tf.float32)
                conv = tf.nn.conv2d(layer_input, kernel, strides=[1, 1, 1, 1], padding='SAME',
                                    name='conv2d')
                bias = zero_var(name='conv_bias', shape=[kernel_shape[-1]], dtype=tf.float32)
                activated = tf.nn.relu(tf.nn.bias_add(conv, bias), name='relu')
                if pool:
                    # Pooling uses stride 1, so the spatial size is not reduced.
                    activated = tf.nn.avg_pool(activated, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1],
                                               padding='SAME', name='pool_layer')
                return tf.nn.dropout(activated, keep_prob=keep_prob, name='dropout')

        # NOTE(review): dropout uses fixed keep_prob values, so it is also
        # active in the test tower built from this function - confirm intent.
        norm1 = conv_block('conv1', input_images, [3, 3, 3, 32], 0.8)
        norm2 = conv_block('conv2', norm1, [3, 3, 32, 32], 0.8, pool=True)
        norm3 = conv_block('conv3', norm2, [3, 3, 32, 64], 0.7)
        norm4 = conv_block('conv4', norm3, [3, 3, 64, 64], 0.7, pool=True)
        norm5 = conv_block('conv5', norm4, [3, 3, 64, 128], 0.6)
        norm6 = conv_block('conv6', norm5, [3, 3, 128, 128], 0.6)
        conv_block('conv7', norm6, [1, 1, 128, 128], 0.6, pool=True)

        # NOTE(review): the dense head reads norm1, so conv2..conv7 are built
        # but unused. Flattening 64x64x32 gives 131072 features, so
        # full1/full_mult is 131072 x 100 float32 (about 52 MB); presumably
        # this variable dominates the ps <-> worker transfers - confirm
        # whether norm7 (and stride-2 pooling) was intended.
        reshaped_output = tf.reshape(norm1, [batch_size, -1])
        reshaped_dim = reshaped_output.get_shape()[1].value

        with tf.variable_scope('full1'):
            full_weight1 = truncated_normal_var(name='full_mult', shape=[reshaped_dim, 100],
                                                dtype=tf.float32)
            full_bias1 = zero_var(name='full_bias', shape=[100], dtype=tf.float32)
            full_layer1 = tf.nn.relu(tf.add(tf.matmul(reshaped_output, full_weight1), full_bias1),
                                     name='relu')
        with tf.variable_scope('full2'):
            full_weight2 = truncated_normal_var(name='full_mult', shape=[100, 32], dtype=tf.float32)
            full_bias2 = zero_var(name='full_bias', shape=[32], dtype=tf.float32)
            full_layer2 = tf.nn.relu(tf.add(tf.matmul(full_layer1, full_weight2), full_bias2),
                                     name='relu')
        with tf.variable_scope('full3'):
            full_weight3 = truncated_normal_var(name='full_mult', shape=[32, num_targets],
                                                dtype=tf.float32)
            full_bias3 = zero_var(name='full_bias', shape=[num_targets], dtype=tf.float32)
            final_output = tf.add(tf.matmul(full_layer2, full_weight3), full_bias3)
        return final_output

    def loss(logits, targets):
        """Mean sparse softmax cross-entropy of ``logits`` against ``targets``."""
        targets = tf.squeeze(tf.cast(targets, tf.int32))
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        return tf.reduce_mean(cross_entropy)

    def accuracy_of_batch(logits, targets):
        """Fraction of rows whose argmax over axis 1 equals the target label."""
        targets = tf.squeeze(tf.cast(targets, tf.int32))
        batch_predictions = tf.cast(tf.argmax(logits, 1), tf.int32)
        predicted_correctly = tf.equal(batch_predictions, targets)
        return tf.reduce_mean(tf.cast(predicted_correctly, tf.float32))

    # NOTE(review): the original indentation was lost; the whole worker body is
    # kept inside the device scope, as in TensorFlow's mnist_replica example.
    with tf.device(tf.train.replica_device_setter(
            cluster=cluster,
            worker_device="/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, 0))):
        global_step = tf.Variable(0, name="global_step", trainable=False)
        images, targets = inputs(data_set='train', batch_size=batch_size, num_epochs=None)
        test_images, test_targets = inputs(data_set='test', batch_size=batch_size, num_epochs=None)

        # Train and test towers share one set of variables.
        with tf.variable_scope('model_definition') as scope:
            model_output = cnn_model(images, batch_size)
            scope.reuse_variables()
            test_output = cnn_model(test_images, batch_size)

        prediction = tf.nn.softmax(model_output)
        test_prediction = tf.nn.softmax(test_output)
        # Named loss_op so the tensor does not rebind the loss() helper.
        loss_op = loss(model_output, targets)
        test_acc = accuracy_of_batch(test_prediction, test_targets)
        train_acc = accuracy_of_batch(prediction, targets)

        if FLAGS.sync_replicas:
            # Synchronous mode: aggregate gradients from several replicas
            # before each parameter update.
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate
            my_optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
            grads_and_vars = my_optimizer.compute_gradients(loss_op)
            opt = tf.train.SyncReplicasOptimizer(my_optimizer,
                                                 replicas_to_aggregate=replicas_to_aggregate,
                                                 total_num_replicas=num_workers,
                                                 name="A01_sync_replicas")
            train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
            init_token_op = opt.get_init_tokens_op()
            chief_queue_runner = opt.get_chief_queue_runner()
            # NOTE(review): no local_init_op / ready_for_local_init_op is
            # passed to the Supervisor below - verify against the
            # SyncReplicasOptimizer docs for the TF version in use.
        else:
            # Asynchronous mode: each worker applies its own updates.
            my_optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            train_op = my_optimizer.minimize(loss_op, global_step=global_step)

        init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir='./checkout',
            init_op=init_op,
            recovery_wait_secs=1,
            global_step=global_step)

        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

        with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:
            if is_chief and FLAGS.sync_replicas:
                sv.start_queue_runners(sess, [chief_queue_runner])
                sess.run(init_token_op)
            print("Worker %d: Session initialization complete." % FLAGS.task_index)
            print("-------------Training begins--------")

            start_time = time.time()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            run_metadata = tf.RunMetadata()
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            try:
                for i in range(FLAGS.train_steps):
                    _, loss_value, temp_train_acc, temp_test_acc, step = sess.run(
                        [train_op, loss_op, train_acc, test_acc, global_step],
                        options=run_options, run_metadata=run_metadata)
                    output = 'Loss = {:.5f}'.format(loss_value)
                    test_acc_output = ' --- test_acc = {:.3f}'.format(temp_test_acc)
                    train_acc_output = ' --- train_acc = {:.3f}'.format(temp_train_acc)
                    time_output = '%.2fs' % (time.time() - start_time)
                    STEP = 'train_step: %d | global_step: %d' % (i, step)
                    print(output, test_acc_output, train_acc_output, time_output, STEP)

                # run_metadata is overwritten by every sess.run, so the trace
                # describes the most recent step only.
                trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                with open('./timeline.json', 'w') as trace_file:
                    trace_file.write(trace.generate_chrome_trace_format())
            finally:
                # Stop the input threads even when a training step raises.
                coord.request_stop()
                coord.join(threads)
if __name__ == "__main__":
    # Script entry point: hand control to TensorFlow's app runner, which
    # invokes main().
    tf.app.run()
结果如下:
P1:timeline image
我用了 4 台笔记本电脑，其中 2 台用作 ps，2 台用作 worker。它们的配置是：
更新:2018/8/15 16:34
我使用 iperf 测试了 ps（server）和 worker（client）之间的网络传输带宽。
对 1 个 ps 和 1 个 worker 进行测试时，结果如下：
P2:1 ps--1 worker image
1 个 ps 和 2 个 worker 同时测试：
P3:1 ps--2 worker image
那么，RecvTensor 占用了很多时间，是否是因为网络带宽较小？从 P2 中可以看到，带宽为每秒 11MB。