TensorFlow training gets stuck after a number of steps, how can I investigate?

Asked: 2016-08-15 16:09:40

Tags: tensorflow

I have a Python script that trains a TensorFlow model similar to the one in the CIFAR-10 tutorial. I have 20,500 training examples and use 128 examples per batch. I set the maximum number of steps to 1,000,000. However, after roughly 164,000 steps the script appears to get stuck somewhere. Is there a way to find out where it is stuck? My last resort is to kill the process with Ctrl-C and force it to print a traceback, but I would like to know whether there is anything else worth checking before I kill the process.
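
One idea I am considering for next time, as a minimal sketch (it assumes the script runs on a Unix-like system and that I register the handler before calling train()): install a signal handler so that sending SIGUSR1 to the process prints the stack of every Python thread without terminating the run.

import signal
import sys
import traceback

def dump_stacks(signum, frame):
    # Print the current stack of every Python thread, to see whether the main
    # thread is blocked inside session.run() or a queue-runner thread is stuck.
    for thread_id, stack in sys._current_frames().items():
        print("\n--- thread %d ---" % thread_id)
        traceback.print_stack(stack)

# Register before the training loop starts; afterwards `kill -USR1 <pid>` from
# another shell dumps the traces while the process keeps running.
signal.signal(signal.SIGUSR1, dump_stacks)

With that in place, `kill -USR1 <pid>` from another terminal should at least show whether the hang is inside the session.run() call of the training loop or somewhere else.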

Here is the training loop:

import os
import time
from datetime import datetime

import numpy as np
import tensorflow as tf

import network  # assumed to be the module containing the graph-building functions shown below

def train(trainingData, batchSize, workingDir, maxSteps):
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        image, label = readData(trainingData)
        minAfterDequeue = 5000
        capacity = minAfterDequeue + 3 * batchSize
        imageBatch, labelBatch = tf.train.shuffle_batch(
            [image, label], batch_size=batchSize, capacity=capacity,
            min_after_dequeue=minAfterDequeue)
        #labelBatch = tf.reshape(labelBatch, [batchSize, 1])
        #tf.image_summary('images', imageBatch)
        #tf.histogram_summary('labels', tf.cast(labelBatch, tf.float32))

        logits = network.inference(imageBatch, 0.5)
        #floatLabel = tf.cast(labelBatch, tf.float32)
        #cross_entropy_per_example = tf.nn.softmax_cross_entropy_with_logits(logits, floatLabel)
        loss, cross_entropy = network.loss(logits, labelBatch)
        train_op = network.train(loss, global_step, batchSize)

        # Create a saver
        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        session = tf.Session()
        init = tf.initialize_all_variables()
        session.run(init)
        tf.train.start_queue_runners(sess=session)

        summary_writer = tf.train.SummaryWriter(workingDir, session.graph_def)

        for step in xrange(maxSteps):
            start_time = time.time()
            #l, sm, ce = session.run([floatLabel, logits, cross_entropy_per_example])
            #print l
            #print sm
            #print ce
            _, loss_value = session.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = batchSize / duration
                format_str = "%s: step %d, loss = %e (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, float(duration)))

            if step % 100 == 0:
                summary_str = session.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == maxSteps:
                checkpoint_path = os.path.join(workingDir, 'model.ckpt')
                saver.save(session, checkpoint_path, global_step=step)

Here are the various functions used to build the graph:

import re
import tensorflow as tf

TOWER_NAME="tower"

NUM_EXAMPLES_PER_EPOCH = 50000
# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.95 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.01       # Initial learning rate.

def _activation_summary(x):
    """Helper to create summaries for activations.
    Creates a summary that provides a histogram of activations.
    Creates a summary that measure the sparsity of activations.
    Args:
        x: Tensor
    Returns:
        nothing
    """
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
    tf.histogram_summary(tensor_name + '/activations', x)
    tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
    #numChannel = tf.shape(x)[3]
    #tf.image_summary(tensor_name + '/image', tf.reshape(x)

def _variable_on_cpu(name, shape, initializer):
    """Helper to create a Variable stored on CPU memory.
    Args:
        name: name of the variable
        shape: list of ints
        initializer: initializer for Variable
    Returns:
        Variable Tensor
    """
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer, dtype=tf.float32)
    return var

def _variable_with_weight_decay(name, shape, stddev, wd=None):
    """Helper to create an initialized Variable with weight decay.
    Note that the Variable is initialized with a truncated normal distribution.
    A weight decay is added only if one is specified.
    Args:
        name: name of the variable
        shape: list of ints
        stddev: standard deviation of a truncated Gaussian
        wd: add L2Loss weight decay multiplied by this float. If None, weight
            decay is not added for this Variable.
    Returns:
        Variable Tensor
    """
    var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
    if wd is not None:
        weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var

def inference(images, dropout):
    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 1, 32], stddev=5e-2)
        conv = tf.nn.conv2d(images, kernel, [1,1,1,1], padding='SAME')
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[3, 3, 32, 64], stddev=5e-2)
        conv = tf.nn.conv2d(pool1, kernel, [1,1,1,1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv2)

    # pool2
    pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    # conv3
    with tf.variable_scope('conv3') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[3, 3, 64, 64], stddev=5e-2)
        conv = tf.nn.conv2d(pool2, kernel, [1,1,1,1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv3 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv3)

    # pool 3
    pool3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool3')

    # fully connected 4
    with tf.variable_scope('full4') as scope:
        batchSize = pool3.get_shape()[0].value
        flattened = tf.reshape(pool3, [batchSize, -1])
        dim = flattened.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 256], stddev=5e-2)
        biases = _variable_on_cpu('biases', [256], tf.constant_initializer(0.1))
        full4 = tf.nn.relu(tf.matmul(flattened, weights) + biases, name=scope.name)
        full4_dropout = tf.nn.dropout(full4, dropout)
        _activation_summary(full4)
        #_activation_summary(full4_dropout)

    # fully connected 5
    with tf.variable_scope('full5') as scope:
        weights = _variable_with_weight_decay('weights', [256, 128], stddev=5e-2)
        biases = _variable_on_cpu('biases', [128], tf.constant_initializer(0.1))
        full5 = tf.nn.relu(tf.matmul(full4_dropout, weights) + biases, name=scope.name)
        full5_dropout = tf.nn.dropout(full5, dropout)
        _activation_summary(full5)
        #_activation_summary(full5_dropout)

    # softmax
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [128, 2], stddev=1/128.0)
        biases = _variable_on_cpu('biases', [2], tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(full5_dropout, weights), biases, name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear

def loss(logits, labels):
    labels = tf.cast(labels, tf.float32)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    return tf.add_n(tf.get_collection('losses'), name='total_loss'), cross_entropy_mean

def _add_loss_summaries(total_loss):
    """Add summaries for losses in CIFAR-10 model.
    Generates moving average for all losses and associated summaries for
    visualizing the performance of the network.
    Args:
        total_loss: Total loss from loss().
    Returns:
        loss_averages_op: op for generating moving averages of losses.
    """
    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Name each loss as '(raw)' and name the moving average version of the loss
        # as the original loss name.
        tf.scalar_summary(l.op.name +' (raw)', l)
        tf.scalar_summary(l.op.name, loss_averages.average(l))

    return loss_averages_op

def train(loss, step, batchSize):
    numBatchesPerEpoch = NUM_EXAMPLES_PER_EPOCH / batchSize
    decay_steps = int(numBatchesPerEpoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)

    loss_averages_op = _add_loss_summaries(loss)

    # compute gradients
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.GradientDescentOptimizer(lr)
        grads = opt.compute_gradients(loss)

    # apply gradients
    apply_gradient_op = opt.apply_gradients(grads, global_step = step)

    # add histograms for trainable variables
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    # add histograms for gradients:
    for grad, var in grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)

    # Track the moving average of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variable_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op

0 Answers:

No answers yet