STANDALONE_CLIENT 模式,当尝试使用估计器进行分布式训练时?

时间:2021-04-04 06:26:56

标签: tensorflow deep-learning conv-neural-network distributed-computing tensorflow-estimator

我正在远程访问我的大学 2 机器。为此,我正在使用 TensorFlow 多工作器镜像策略。我正在尝试在两台机器上部署一个深度模型。为此,代码行是:

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["gpu11.cse.cuhk.edu.hk:8000", "gpu12.cse.cuhk.edu.hk:8000"]
    },
    'task': {'type': 'worker', 'index': 0}
})

我不确定这个工作地址语法,可以吗?

** 完整代码:**

    from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
import os
import json
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.INFO)
from tensorflow.keras.datasets import mnist

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["gpu11.cse.cuhk.edu.hk:8000", "gpu12.cse.cuhk.edu.hk:8000"]
    },
    'task': {'type': 'worker', 'index': 0}
})

def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
    input_layer = tf.cast(input_layer, tf.float32)
    labels = tf.cast(labels, tf.int32)
    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 28, 28, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 7, 7, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 64]
    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


def per_device_batch_size(batch_size, num_gpus):
    """For multi-gpu, batch-size must be a multiple of the number of GPUs.
    Note that this should eventually be handled by DistributionStrategies
    directly. Multi-GPU support is currently experimental, however,
    so doing the work here until that feature is in place.
    Args:
      batch_size: Global batch size to be divided among devices. This should be
        equal to num_gpus times the single-GPU batch_size for multi-gpu training.
      num_gpus: How many GPUs are used with DistributionStrategies.
    Returns:
      Batch size per device.
    Raises:
      ValueError: if batch_size is not divisible by number of devices
    """
    if num_gpus <= 1:
        return batch_size

    remainder = batch_size % num_gpus
    if remainder:
        err = ('When running with multiple GPUs, batch size '
               'must be a multiple of the number of available GPUs. Found {} '
               'GPUs with a batch size of {}; try --batch_size={} instead.'
               ).format(num_gpus, batch_size, batch_size - remainder)
        raise ValueError(err)
    return int(batch_size / num_gpus)


class InputFnProvider:
    def __init__(self, train_batch_size):
        self.train_batch_size = train_batch_size
        self.__load_data()

    def __load_data(self):
        # Load training and eval data

        (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
        #batch_size
       # X_train = tf.cast(X_train, tf.float32)
        #X_test = tf.cast(X_test, tf.float32)
       # mnist = tf.compat.v1.contrib.learn.datasets.load_dataset("mnist")
        self.train_data = X_train # Returns np.array
        self.train_labels = Y_train
        self.eval_data = X_test  # Returns np.array
        self.eval_labels = Y_test

    def train_input_fn(self):
        """An input function for training"""
        # Shuffle, repeat, and batch the examples.
        dataset = tf.data.Dataset.from_tensor_slices(({"x": self.train_data}, self.train_labels))
        dataset = dataset.shuffle(1000).repeat().batch(self.train_batch_size)
        return dataset

    def eval_input_fn(self):
        """An input function for evaluation or prediction"""
        dataset = tf.data.Dataset.from_tensor_slices(({"x": self.eval_data}, self.eval_labels))
        dataset = dataset.batch(1)
        return dataset


def main(unused_argv):
    batch_size = 100
    num_gpus = 2

    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_gpus))

    # Use multiple GPUs by MirroredStragtegy.
    # All avaiable GPUs will be used if `num_gpus` is omitted.

    if num_gpus > 1:
        distribution = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
        distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        model_dir="/tmp/mnist_convnet_model")

    # Create the Estimator
    # pass RunConfig
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        config=config)

    # Train the model
    mnist_classifier.train(
        input_fn=input_fn_provider.train_input_fn,
        steps=1000)

    # Evaluate the model and print results
    eval_results = mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn)
    print(eval_results)


if __name__ == "__main__":
    tf.app.run()

错误

回溯(最近一次调用最后一次):

  File "mnist.py", line 223, in <module>
    tf.app.run()
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow/python/platform/app.py", line 40, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/absl/app.py", line 303, in run
    _run_main(main, args)
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "mnist.py", line 213, in main
    mnist_classifier.train(
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 349, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1173, in _train_model
    return self._train_model_distributed(input_fn, hooks, saving_listeners)
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1226, in _train_model_distributed
    distribute_coordinator_training.estimator_train(
  File "/research/dept8/gds/anafees/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/estimator_training.py", line 310, in estimator_train
    raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
ValueError: Only `STANDALONE_CLIENT` mode is supported when you call `estimator.train`

0 个答案:

没有答案
相关问题