如何在 tf.train.MonitoredTrainingSession API 中使用 HDFS 目录路径来写入日志和检查点

时间:2017-08-23 15:12:08

标签: tensorflow

在分布式 TensorFlow 中,我使用两台不同的机器作为 worker 服务器,一台机器作为参数服务器(parameter server)。worker 和参数服务器都需要访问同一个共享目录,以便在其中写入日志和检查点。我打算使用 HDFS 作为共享目录,但它无法正常工作。有没有人知道代码中哪里有问题?或者除了使用 HDFS 之外,有没有其他可以用作共享目录的方案?

import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

# Parsed command-line flags; assigned by the __main__ block before main() runs.
FLAGS = None

def deepnn(x):
  """Build the convolutional MNIST classifier graph.

  The network is conv(5x5, 32) -> 2x2 max-pool -> conv(5x5, 64) ->
  2x2 max-pool -> fully connected (1024, ReLU) -> dropout ->
  fully connected (10).

  Args:
    x: Input tensor that can be reshaped to [-1, 28, 28, 1]; the caller in
      this file passes a float32 placeholder of shape [None, 784].

  Returns:
    A tuple (y_conv, keep_prob). y_conv is the output of the final
    10-unit layer (used as logits by the caller); keep_prob is a float32
    placeholder that must be fed with the dropout keep probability.
  """
  # Restore the flat input to a single-channel 28x28 image.
  x_image = tf.reshape(x, [-1, 28, 28, 1])

  # First convolutional layer: 5x5 filters, 1 input channel, 32 outputs.
  W_conv1 = weight_variable([5, 5, 1, 32])
  b_conv1 = bias_variable([32])
  h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

  # 2x2 pooling with stride 2 halves each spatial dimension (28 -> 14).
  h_pool1 = max_pool_2x2(h_conv1)

  # Second convolutional layer: 5x5 filters, 32 input channels, 64 outputs.
  W_conv2 = weight_variable([5, 5, 32, 64])
  b_conv2 = bias_variable([64])
  h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)

  # Second pooling step (14 -> 7), hence the 7 * 7 * 64 flattened size below.
  h_pool2 = max_pool_2x2(h_conv2)

  # Fully connected layer with 1024 units.
  W_fc1 = weight_variable([7 * 7 * 64, 1024])
  b_fc1 = bias_variable([1024])

  h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
  h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

  # Dropout; the keep probability is supplied by the caller at run time.
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

  # Output layer: one unit per digit class.
  W_fc2 = weight_variable([1024, 10])
  b_fc2 = bias_variable([10])

  y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
  return y_conv, keep_prob

def conv2d(x, W):
  """Convolve x with filter W using stride 1 and 'SAME' padding."""
  unit_strides = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
  """Max-pool x over 2x2 windows with stride 2 and 'SAME' padding."""
  window = [1, 2, 2, 1]
  step = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

def weight_variable(shape):
  """Return a tf.Variable of the given shape drawn from a truncated normal (stddev 0.1)."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Return a tf.Variable of the given shape with every element set to 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))

def create_queue(job_name, task_index, worker_hosts):
  """Create the shared int32 FIFO queue that lives on one cluster task.

  Args:
    job_name: Job the queue is placed on (e.g. "ps" or "worker").
    task_index: Index of the task within that job.
    worker_hosts: List of worker addresses; only its length is used, as the
      queue capacity.

  Returns:
    A tf.FIFOQueue placed on /job:<job_name>/task:<task_index> whose
    shared_name is "queue_<job_name>_<task_index>", so every process that
    calls this with the same job and index refers to the same queue.
  """
  with tf.device("/job:%s/task:%d" % (job_name, task_index)):
    # The return must be inside the device scope so the queue is pinned to
    # the requested task.
    return tf.FIFOQueue(
        len(worker_hosts),
        tf.int32,
        shared_name="queue_" + str(job_name) + "_" + str(task_index))

def main(_):
  """Run this process as one task of the distributed MNIST training cluster.

  The cluster layout and this task's role are read from the module-level
  FLAGS. A "ps" task blocks until one completion token per worker has
  arrived on its queue. A "worker" task builds the model, trains under a
  MonitoredTrainingSession (checkpointing to FLAGS.log_dir) until global
  step 1000, then puts a token on every task's queue and waits until its
  own queue has received one token per worker. For any other job name the
  server is started and the function returns.

  Args:
    _: Unused; the leftover argv list passed in by tf.app.run.
  """
  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")

  # Create a cluster from the parameter server and worker hosts.
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

  # Create and start a server for the local task.
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
    # Control shutdown of the parameter server through a queue instead of
    # server.join(), which would block forever.
    queue = create_queue(FLAGS.job_name, FLAGS.task_index, worker_hosts)
    wait_op = queue.dequeue()

    with tf.Session(server.target) as sess:
      # One token is expected from each worker.
      for _ in worker_hosts:
        sess.run(wait_op)

  elif FLAGS.job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_index,
        cluster=cluster)):

      # Import data
      mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

      # Build Deep MNIST model...
      x = tf.placeholder(tf.float32, [None, 784])
      y_ = tf.placeholder(tf.float32, [None, 10])
      y_conv, keep_prob = deepnn(x)

      cross_entropy = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

      global_step = tf.contrib.framework.get_or_create_global_step()

      train_step = tf.train.AdamOptimizer(1e-4).minimize(
          cross_entropy, global_step=global_step)
      correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Create queues for all servers participating in the cluster.
    queue = create_queue(FLAGS.job_name, FLAGS.task_index, worker_hosts)
    queues = []
    for ps_index in range(len(ps_hosts)):
      queues.append(create_queue("ps", ps_index, worker_hosts))
    for worker_index in range(len(worker_hosts)):
      queues.append(create_queue("worker", worker_index, worker_hosts))

    # Build the completion-signalling ops before the monitored session is
    # created: MonitoredTrainingSession finalizes the default graph, and a
    # finalized graph rejects any new op (including enqueue/dequeue).
    done_ops = [q.enqueue(1) for q in queues]
    wait_op = queue.dequeue()

    # The StopAtStepHook handles stopping after running given steps.
    hooks = [tf.train.StopAtStepHook(last_step=1000)]

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           is_chief=(FLAGS.task_index == 0),
                                           checkpoint_dir=FLAGS.log_dir,
                                           hooks=hooks) as mon_sess:
      i = 0
      while not mon_sess.should_stop():
        # Run a training step asynchronously.
        batch = mnist.train.next_batch(50)
        if i % 100 == 0:
          # Report accuracy on the current batch with dropout disabled.
          train_accuracy = mon_sess.run(accuracy, feed_dict={
              x: batch[0], y_: batch[1], keep_prob: 1.0})
          print('global_step %s, task:%d_step %d, training accuracy %g'
                % (tf.train.global_step(mon_sess, global_step),
                   FLAGS.task_index, i, train_accuracy))
        mon_sess.run(train_step,
                     feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
        i = i + 1

    # Notify every task that this worker is done, then wait until all
    # workers have notified this one.
    with tf.Session(server.target) as sess:
      for done_op in done_ops:
        sess.run(done_op)
      for _ in worker_hosts:
        sess.run(wait_op)

if __name__ == "__main__":
  # Script entry point: parse the known flags into the module-level FLAGS
  # and hand any unrecognised arguments on to tf.app.run.
  parser = argparse.ArgumentParser()
  parser.register("type", "bool", lambda v: v.lower() == "true")
  # Flags for defining the tf.train.ClusterSpec
  parser.add_argument(
      "--ps_hosts",
      type=str,
      default="",
      help="Comma-separated list of hostname:port pairs"
  )
  parser.add_argument(
      "--worker_hosts",
      type=str,
      default="",
      help="Comma-separated list of hostname:port pairs"
  )
  parser.add_argument(
      "--job_name",
      type=str,
      default="",
      help="One of 'ps', 'worker'"
  )
  # Flags for defining the tf.train.Server
  parser.add_argument(
      "--task_index",
      type=int,
      default=0,
      help="Index of task within the job"
  )
  # Flags for specifying input/output directories
  parser.add_argument(
      "--data_dir",
      type=str,
      default="/home/goel/workspace/mnist/mnist",
      help="Directory for storing input data")

  parser.add_argument(
      "--log_dir",
      type=str,
      default="hdfs://SDL/user/goel/logs",
      help="Directory for train logs")

  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

当我将 "hdfs://SDL/user/goel/logs" 作为 checkpoint_dir 参数传递给 MonitoredTrainingSession API 时,屏幕上会显示以下日志。程序只是停在那里,不再做任何事情:既没有报错,也没有向 HDFS 写入任何内容。仅供参考,我可以使用 pydoop 将文件写入同一个 HDFS 目录。

 Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/hadoop/tmp
2017-08-23 15:13:43.655845: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-23 15:13:43.655884: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-23 15:13:43.655892: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-08-23 15:13:43.655899: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-08-23 15:13:43.665183: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:200] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2845}
2017-08-23 15:13:43.665222: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:200] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2745}
2017-08-23 15:13:43.667440: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:240] Started server with target: grpc://localhost:2745
  Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/hadoop/tmp
  2017-08-23 15:13:44,397 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(62)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
 2017-08-23 15:13:45,097 WARN  [main] shortcircuit.DomainSocketFactory (DomainSocketFactory.java:<init>(117)) - The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.

0 个答案:

没有答案