I have the following code, which performs synchronous gradient descent in TensorFlow based on the example here: https://github.com/tensorflow/models/blob/master/inception/inception/inception_distributed_train.py
The command I use to run this on the cluster I have access to is:
(python2 SyncSGD.py --job_name='ps' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=1 &) && (python2 SyncSGD.py --job_name='worker' --task_index=2 &)
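Incidentally, while debugging I have also launched the four tasks from a small Python wrapper (this is not part of the script itself, just a convenience sketch) so that each process writes to its own log file and a task that dies at startup, e.g. on a flag-parsing error, is easy to spot:

import subprocess

# Hypothetical launcher: one ps task and three worker tasks, each logging to
# its own file so startup crashes are visible instead of silently lost.
tasks = [("ps", 0), ("worker", 0), ("worker", 1), ("worker", 2)]
procs = []
for job, idx in tasks:
    log = open("%s-%d.log" % (job, idx), "w")
    procs.append(subprocess.Popen(
        ["python2", "SyncSGD.py",
         "--job_name=%s" % job,
         "--task_index=%d" % idx],
        stdout=log, stderr=subprocess.STDOUT))
for p in procs:
    p.wait()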
For some reason, the extra workers never get past the sv.prepare_or_wait_for_session call. The chief (FLAGS.task_index == 0) runs one full epoch of ops and then hangs or times out. After I kill all the processes, or after the timeout, the other workers do get past the sv.prepare_or_wait_for_session call; however, their printouts indicate that they reach the first epoch and then hang there.
This doesn't happen consistently, either. Sometimes the chief never gets past the first run op of the first epoch, while the other two workers catch up roughly 30 seconds later and then also hang in the run op. Other times some workers get through a random number of training epochs (one worker will produce 2 epochs while the other two produce only 1). Sometimes it even finishes. I have no idea what is causing this, but it looks like there is a serious synchronization problem between the Supervisor (the Coordinator wrapper) and SyncReplicasOptimizer. I have searched extensively but still don't even know what to search for to fix this; it is really my last resort.
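One diagnostic I have thought about (it is not in the code below) is giving the session an operation timeout, so a stuck sess.run raises an error instead of blocking forever; as far as I know, operation_timeout_in_ms is a standard ConfigProto field and prepare_or_wait_for_session accepts a config argument:

import tensorflow as tf

# Sketch only: `sv` and `self.server` are the Supervisor and tf.train.Server
# created in the script below.
config = tf.ConfigProto(operation_timeout_in_ms=60000)  # fail any op after 60s
with sv.prepare_or_wait_for_session(self.server.target, config=config) as sess:
    pass  # a hung sess.run(...) now raises DeadlineExceededError instead of blocking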
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.platform import app
import sys
import time
# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
class SynchSGD:
    def __init__(self, parameter_servers, workers):
        self.parameter_servers = parameter_servers
        self.workers = workers
        self.cluster = tf.train.ClusterSpec({"ps": parameter_servers, "worker": workers})
        # start a server for a specific task
        self.server = tf.train.Server(self.cluster,
                                      job_name=FLAGS.job_name,
                                      task_index=FLAGS.task_index)

    def run(self, fetches, fetches_format, dataset, batch_size=1, test_dataset=None,
            learning_rate=0.001, test_fetches_format=None, training_epochs=20,
            logs_path='/tmp/mnist/1'):
        if FLAGS.job_name == "ps":
            self.server.join()
        elif FLAGS.job_name == "worker":
            # Between-graph replication: each worker builds its own copy of the
            # graph, with variables placed on the ps task.
            with tf.device(tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d/cpu:0" % FLAGS.task_index,
                    cluster=self.cluster)):
                print(str(FLAGS.task_index), "b4 global step")
                # count the number of updates
                global_step = tf.get_variable('global_step', [],
                                              initializer=tf.constant_initializer(0),
                                              trainable=False)
                print(str(FLAGS.task_index), "b4 fetches")
                # `fetches` is a graph-building callback; it returns the input
                # placeholders and the list of ops/tensors to run each step
                inputs, fetches = fetches(learning_rate, global_step)
                if FLAGS.task_index == 0:
                    # the chief is responsible for the sync-replicas bookkeeping
                    chief_queue_runner = fetches[-1]
                    init_token_op = fetches[-2]
                init_op = tf.initialize_all_variables()
                print(str(FLAGS.task_index), "b4 sv")
                sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                         global_step=global_step,
                                         init_op=init_op,
                                         logdir=logs_path)
                begin_time = time.time()
                frequency = 100
                print(str(FLAGS.task_index), "b4 sesh")
                # the chief runs init_op; non-chief workers block here until the
                # chief's initialization becomes visible
                with sv.prepare_or_wait_for_session(self.server.target) as sess:
                    print(str(FLAGS.task_index), "b4 qr")
                    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
                    sv.start_queue_runners(sess, queue_runners)
                    print(str(FLAGS.task_index), " after qr")
                    if FLAGS.task_index == 0:
                        # chief only: start the sync token queue runner and seed it
                        sv.start_queue_runners(sess, [chief_queue_runner])
                        sess.run(init_token_op)
                    print(str(FLAGS.task_index), " after cqr")
                    if 'summary' in fetches_format:
                        # create log writer object (this will log on every machine)
                        writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
                    # perform training cycles
                    start_time = time.time()
                    for epoch in range(training_epochs):
                        # number of batches in one epoch
                        batch_count = int(dataset.num_examples / batch_size)
                        count = 0
                        print(str(FLAGS.task_index), str(epoch))
                        for i in range(batch_count):
                            batch_x, batch_y = dataset.next_batch(batch_size)
                            # perform the operations we defined earlier on batch
                            result = sess.run(
                                fetches[:-2],
                                feed_dict={inputs[0]: batch_x, inputs[1]: batch_y})
                            print(str(FLAGS.task_index), str(i))
                            if 'summary' in fetches_format:
                                writer.add_summary(result[fetches_format['summary']],
                                                   result[fetches_format['step']])
                            count += 1
                            if count % frequency == 0 or i + 1 == batch_count:
                                elapsed_time = time.time() - start_time
                                start_time = time.time()
                                std_out = ''
                                count = 0
                #sv.stop()
def main(argv=None):
    # cluster specification
    parameter_servers = ["localhost:2222"]
    workers = ["localhost:2223",
               "localhost:2224",
               "localhost:2225"]
    # config
    batch_size = 100
    learning_rate = 0.001
    training_epochs = 3
    logs_path = "/rscratch/cs194/psharing-neural-nets/sync-logging"

    # create variables for model
    def fetches(learning_rate, global_step):
        # input images
        with tf.name_scope('input'):
            # None -> batch size can be any size, 784 -> flattened mnist image
            x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
            # target 10 output classes
            y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
        # model parameters will change during training so we use tf.Variable
        tf.set_random_seed(1)
        with tf.name_scope("weights"):
            W1 = tf.Variable(tf.random_normal([784, 100]))
            W2 = tf.Variable(tf.random_normal([100, 10]))
        # bias
        with tf.name_scope("biases"):
            b1 = tf.Variable(tf.zeros([100]))
            b2 = tf.Variable(tf.zeros([10]))
        # implement model: a two-layer sigmoid/softmax MLP
        with tf.name_scope("softmax"):
            # y is our prediction
            z2 = tf.add(tf.matmul(x, W1), b1)
            a2 = tf.nn.sigmoid(z2)
            z3 = tf.add(tf.matmul(a2, W2), b2)
            y = tf.nn.softmax(z3)
        # specify cost function
        with tf.name_scope('cross_entropy'):
            # this is our cost
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
        # specify optimizer
        with tf.name_scope('train'):
            # optimizer is an "operation" which we can execute in a session
            grad_op = tf.train.GradientDescentOptimizer(learning_rate)
            print('len workers %d: ' % FLAGS.task_index, str(len(workers)))
            # wrap the optimizer so gradients from all replicas are aggregated
            # before a single update is applied
            rep_op = tf.train.SyncReplicasOptimizer(grad_op,
                                                    replicas_to_aggregate=len(workers),
                                                    replica_id=FLAGS.task_index,
                                                    total_num_replicas=len(workers),
                                                    use_locking=True)
            grads = rep_op.compute_gradients(cross_entropy)
            apply_gradients_op = rep_op.apply_gradients(grads, global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(cross_entropy, name='train_op')
            #train_op = rep_op.minimize(cross_entropy, global_step=global_step)
            #train_op = grad_op.minimize(cross_entropy, global_step=global_step)
            init_token_op = rep_op.get_init_tokens_op()
            chief_queue_runner = rep_op.get_chief_queue_runner()
        with tf.name_scope('Accuracy'):
            # accuracy
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        # create a summary for our cost and accuracy
        tf.scalar_summary("cost", cross_entropy)
        tf.scalar_summary("accuracy", accuracy)
        # merge all summaries into a single "operation" which we can execute in a session
        summary_op = tf.merge_all_summaries()
        return [x, y_], [train_op, cross_entropy, summary_op, global_step,
                         accuracy, init_token_op, chief_queue_runner]

    # load mnist data set
    from tensorflow.examples.tutorials.mnist import input_data
    dataset = input_data.read_data_sets('MNIST_data', one_hot=True).train
    test_dataset = input_data.read_data_sets('MNIST_data', one_hot=True).test
    fetches_format = {'train': 0, 'cost': 1, 'summary': 2, 'step': 3}
    test_fetches_format = {'accuracy': 0}
    sgd = SynchSGD(parameter_servers, workers)
    sgd.run(fetches, fetches_format, dataset, batch_size=batch_size,
            learning_rate=learning_rate, test_dataset=test_dataset,
            training_epochs=training_epochs, logs_path=logs_path)

if __name__ == "__main__":
    app.run()
For some reason it also seems to spawn a bunch of extra jobs (which look like jobs for communicating with the other tasks), producing the following memory allocations on 8 K40s (the python2 processes):
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 26489 C ./train-imagenet-sgd 8598MiB |
| 0 28733 C python2 389MiB |
| 0 28735 C python2 123MiB |
| 0 28737 C python2 106MiB |
| 0 28739 C python2 2976MiB |
| 1 26489 C ./train-imagenet-sgd 8598MiB |
| 1 28733 C python2 206MiB |
| 1 28735 C python2 306MiB |
| 1 28737 C python2 106MiB |
| 1 28739 C python2 2976MiB |
| 2 26489 C ./train-imagenet-sgd 8598MiB |
| 2 28733 C python2 206MiB |
| 2 28735 C python2 306MiB |
| 2 28737 C python2 106MiB |
| 2 28739 C python2 2974MiB |
| 3 26489 C ./train-imagenet-sgd 8598MiB |
| 3 28733 C python2 206MiB |
| 3 28735 C python2 306MiB |
| 3 28737 C python2 106MiB |
| 3 28739 C python2 2974MiB |
| 4 26489 C ./train-imagenet-sgd 8598MiB |
| 4 28733 C python2 206MiB |
| 4 28735 C python2 286MiB |
| 4 28737 C python2 106MiB |
| 4 28739 C python2 2976MiB |
| 5 26489 C ./train-imagenet-sgd 8598MiB |
| 5 28733 C python2 2976MiB |
| 5 28735 C python2 306MiB |
| 5 28737 C python2 106MiB |
| 5 28739 C python2 206MiB |
| 6 26489 C ./train-imagenet-sgd 8704MiB |
| 6 28733 C python2 206MiB |
| 6 28735 C python2 106MiB |
| 6 28737 C python2 106MiB |
| 6 28739 C python2 106MiB |
| 7 26489 C ./train-imagenet-sgd 8598MiB |
| 7 28733 C python2 107MiB |
| 7 28735 C python2 106MiB |
| 7 28737 C python2 106MiB |
| 7 28739 C python2 106MiB |
+-----------------------------------------------------------------------------+
If the other jobs weren't there, it would allocate even more (and still fail), producing this huge stream of failed memory allocations:
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
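One thing I am considering (it is not in the script above) is capping how much GPU memory each TensorFlow process grabs, since by default every process tries to claim nearly all of the memory on every visible GPU. As far as I know, the gpu_options fields of ConfigProto and the CUDA_VISIBLE_DEVICES environment variable are the standard knobs for this; a sketch:

import os
import tensorflow as tf

# Option 1: hide the GPUs entirely from the ps task
# (must be set before TensorFlow initializes CUDA).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Option 2: let each worker grow its allocation on demand instead of
# pre-allocating, and/or cap it at a fraction of device memory.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.3

# The config can be passed both to the in-process server and to the session:
# server = tf.train.Server(cluster, job_name=..., task_index=..., config=config)
# sess = sv.prepare_or_wait_for_session(server.target, config=config)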
I have been trying to get this working for 4 days now, so any help is appreciated.