I have been trying to apply the official Custom training with tf.distribute.Strategy tutorial to the other official Image segmentation tutorial and ended up with the code below. It runs fine on 2 GPUs, except that the training loss is on a completely wrong scale:
Epoch 1, Loss: 35478.49609375, Accuracy: 48.560935974121094, Test Loss: 0.8764073252677917, Test Accuracy: 57.97665023803711
Epoch 2, Loss: 20161.634765625, Accuracy: 74.82583618164062, Test Loss: 0.6519305109977722, Test Accuracy: 77.33595275878906
Epoch 3, Loss: 15657.2880859375, Accuracy: 81.60499572753906, Test Loss: 0.5801540017127991, Test Accuracy: 79.94847106933594
Epoch 4, Loss: 13322.1689453125, Accuracy: 84.52685546875, Test Loss: 0.5113006830215454, Test Accuracy: 82.27192687988281
Epoch 5, Loss: 11845.38671875, Accuracy: 85.9767837524414, Test Loss: 0.4614977538585663, Test Accuracy: 83.19354248046875
Epoch 6, Loss: 10827.380859375, Accuracy: 86.9468002319336, Test Loss: 0.43975135684013367, Test Accuracy: 83.65667724609375
Epoch 7, Loss: 10006.4892578125, Accuracy: 87.75154113769531, Test Loss: 0.4181833863258362, Test Accuracy: 83.8880386352539
Epoch 8, Loss: 9534.9345703125, Accuracy: 88.15916442871094, Test Loss: 0.40620285272598267, Test Accuracy: 84.22107696533203
Epoch 9, Loss: 8993.767578125, Accuracy: 88.73575592041016, Test Loss: 0.3957768976688385, Test Accuracy: 84.42972564697266
Epoch 10, Loss: 8425.7080078125, Accuracy: 89.38662719726562, Test Loss: 0.37987643480300903, Test Accuracy: 84.94923400878906
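For reference, this is roughly how the non-distributed Image segmentation tutorial trains the same model (sketched from memory, so the exact arguments may differ); the custom loop in my code below is meant to reproduce this under tf.distribute.Strategy:

# Non-distributed baseline, roughly as in the Image segmentation tutorial
# (sketch from memory; the exact arguments may differ).
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model_history = model.fit(train_dataset, epochs=EPOCHS,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          validation_steps=VALIDATION_STEPS,
                          validation_data=test_dataset)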
What is wrong with my code? I just copied what the tutorials do.
Thanks in advance for your answers!
My code:
import sys, os
import tensorflow as tf
from tensorflow_examples.models.pix2pix import pix2pix
import tensorflow_datasets as tfds
import time
def normalize(input_image, input_mask):
    input_image = tf.cast(input_image, tf.float32) / 255.0
    input_mask -= 1
    return input_image, input_mask

@tf.function
def load_image_train(datapoint):
    input_image = tf.image.resize(datapoint['image'], (128, 128))
    input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128))
    if tf.random.uniform(()) > 0.5:
        input_image = tf.image.flip_left_right(input_image)
        input_mask = tf.image.flip_left_right(input_mask)
    input_image, input_mask = normalize(input_image, input_mask)
    return input_image, input_mask

def load_image_test(datapoint):
    input_image = tf.image.resize(datapoint['image'], (128, 128))
    input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128))
    input_image, input_mask = normalize(input_image, input_mask)
    return input_image, input_mask
def main():
    dataset, info = tfds.load('oxford_iiit_pet:3.0.0', with_info=True)

    TRAIN_LENGTH = info.splits['train'].num_examples
    BATCH_SIZE = 192
    BUFFER_SIZE = 1000
    STEPS_PER_EPOCH = TRAIN_LENGTH // BATCH_SIZE

    train = dataset['train'].map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    test = dataset['test'].map(load_image_test)

    train_dataset = train.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    test_dataset = test.batch(BATCH_SIZE)

    OUTPUT_CHANNELS = 3
    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        base_model = tf.keras.applications.MobileNetV2(input_shape=[128, 128, 3], include_top=False)

        # Use the activations of these layers
        layer_names = [
            'block_1_expand_relu',   # 64x64
            'block_3_expand_relu',   # 32x32
            'block_6_expand_relu',   # 16x16
            'block_13_expand_relu',  # 8x8
            'block_16_project',      # 4x4
        ]
        layers = [base_model.get_layer(name).output for name in layer_names]

        # Create the feature extraction model
        down_stack = tf.keras.Model(inputs=base_model.input, outputs=layers)
        down_stack.trainable = False

        up_stack = [
            pix2pix.upsample(512, 3),  # 4x4 -> 8x8
            pix2pix.upsample(256, 3),  # 8x8 -> 16x16
            pix2pix.upsample(128, 3),  # 16x16 -> 32x32
            pix2pix.upsample(64, 3),   # 32x32 -> 64x64
        ]

        def unet_model(output_channels):
            # This is the last layer of the model
            last = tf.keras.layers.Conv2DTranspose(
                output_channels, 3, strides=2,
                padding='same', activation='softmax')  # 64x64 -> 128x128

            inputs = tf.keras.layers.Input(shape=[128, 128, 3])
            x = inputs

            # Downsampling through the model
            skips = down_stack(x)
            x = skips[-1]
            skips = reversed(skips[:-1])

            # Upsampling and establishing the skip connections
            for up, skip in zip(up_stack, skips):
                x = up(x)
                concat = tf.keras.layers.Concatenate()
                x = concat([x, skip])

            x = last(x)

            return tf.keras.Model(inputs=inputs, outputs=x)

        model = unet_model(OUTPUT_CHANNELS)
        optimizer = tf.keras.optimizers.Adam(3e-4)
    EPOCHS = 2
    VALIDATION_STEPS = info.splits['test'].num_examples // BATCH_SIZE

    print('Start training')
    start = time.time()

    epochs = EPOCHS
    steps = STEPS_PER_EPOCH
    GLOBAL_BATCH_SIZE = BATCH_SIZE

    # Distribute the datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)
    with strategy.scope():
        # Set reduction to `none` so we can do the reduction afterwards and divide by
        # global batch size.
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction=tf.keras.losses.Reduction.NONE)
        # or loss_fn = tf.keras.losses.sparse_categorical_crossentropy

        def compute_loss(labels, predictions):
            per_example_loss = loss_object(labels, predictions)
            return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    with strategy.scope():
        test_loss = tf.keras.metrics.Mean(name='test_loss')
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy')
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='test_accuracy')
    with strategy.scope():
        def train_step(inputs):
            images, labels = inputs

            with tf.GradientTape() as tape:
                predictions = model(images, training=True)
                loss = compute_loss(labels, predictions)

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            train_accuracy.update_state(labels, predictions)
            return loss

        def test_step(inputs):
            images, labels = inputs

            predictions = model(images, training=False)
            t_loss = loss_object(labels, predictions)

            test_loss.update_state(t_loss)
            test_accuracy.update_state(labels, predictions)
    with strategy.scope():
        # `experimental_run_v2` replicates the provided computation and runs it
        # with the distributed input.
        @tf.function
        def distributed_train_step(dataset_inputs):
            per_replica_losses = strategy.experimental_run_v2(train_step,
                                                              args=(dataset_inputs,))
            return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                                   axis=None)

        @tf.function
        def distributed_test_step(dataset_inputs):
            return strategy.experimental_run_v2(test_step, args=(dataset_inputs,))
        train_iter = iter(train_dataset)
        for epoch in range(EPOCHS):
            # TRAIN LOOP: run a fixed number of distributed steps per epoch
            # (train_dataset repeats indefinitely)
            total_loss = 0.0
            num_batches = 0
            while True:
                x = next(train_iter)
                total_loss += distributed_train_step(x)
                num_batches += 1
                if num_batches >= steps:
                    break
            train_loss = total_loss / num_batches

            # TEST LOOP
            for x in test_dist_dataset:
                distributed_test_step(x)

            # if epoch % 2 == 0:
            #     checkpoint.save(checkpoint_prefix)

            template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
                        "Test Accuracy: {}")
            print(template.format(epoch + 1, train_loss,
                                  train_accuracy.result() * 100, test_loss.result(),
                                  test_accuracy.result() * 100))

            test_loss.reset_states()
            train_accuracy.reset_states()
            test_accuracy.reset_states()
if __name__ == "__main__":
    main()