Question

我一直在尝试 TensorFlow 的 Cifar10 演示（https://www.tensorflow.org/versions/master/tutorials/deep_cnn/index.html#convolutional-neural-networks）。当我在 Google 的演示代码中实现批量大小为 1 的 AlexNet（227x227 图像）时，得到了以下有趣的输出：

2016-01-12 13:19:28.042847: step 0, loss = 208.40 (0.0 examples/sec; 28.081 sec/batch)
2016-01-12 13:19:37.991020: step 10, loss = 106981280.00 (2.3 examples/sec; 0.440 sec/batch)
2016-01-12 13:19:42.422207: step 20, loss = 318574152697306415104.00 (2.2 examples/sec; 0.446 sec/batch)
2016-01-12 13:19:46.824720: step 30, loss = 316031624417051148288.00 (2.3 examples/sec; 0.439 sec/batch)
2016-01-12 13:19:51.238086: step 40, loss = 313512247453630332928.00 (2.2 examples/sec; 0.449 sec/batch)
2016-01-12 13:19:55.676777: step 50, loss = 311012714476067618816.00 (2.3 examples/sec; 0.436 sec/batch)
2016-01-12 13:20:00.056848: step 60, loss = 308535664312269668352.00 (2.3 examples/sec; 0.438 sec/batch)
2016-01-12 13:20:04.442751: step 70, loss = 306075115618981380096.00 (2.3 examples/sec; 0.436 sec/batch)

为什么仅仅几个批次之后损失值就变得如此巨大？网络似乎也没有收敛，其结果与随机猜测无异（我训练它是为了判断照片的色彩平衡有多好）。我只贴出我改动过的代码——除了少量目录名称的修改和图像尺寸参数之外，其余部分都可以在 TensorFlow 网站上找到。如果您需要查看更多代码，请告诉我。

def inference(images):
  """Assemble the AlexNet-style network and return its unnormalized outputs.

  The graph is made of five convolutional layers activated with relu6 (the
  first two are each followed by local response normalization and max
  pooling, the fifth by max pooling only), two 4096-unit ReLU layers, and a
  final linear layer. No softmax is applied inside this function.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits: the output of the final linear layer, with NUM_CLASSES columns.
  """
  # Variables are created through the _variable_with_weight_decay() and
  # _variable_on_cpu() helpers rather than with tf.Variable(). Per the
  # tutorial's original note, this is done so that variables can be shared
  # across multiple GPU training runs; on a single GPU plain tf.Variable()
  # would be sufficient.

  def _conv_relu6(scope_name, inputs, filter_shape, strides, bias_init):
    # One convolutional layer: conv2d -> bias_add -> relu6, all created under
    # the variable scope `scope_name`. The bias length is the number of
    # output channels, i.e. the last entry of `filter_shape`.
    with tf.variable_scope(scope_name) as scope:
      filters = _variable_with_weight_decay('weights', shape=filter_shape,
                                            stddev=1e-4, wd=0.0)
      convolved = tf.nn.conv2d(inputs, filters, strides, padding='SAME')
      offsets = _variable_on_cpu('biases', [filter_shape[-1]],
                                 tf.constant_initializer(bias_init))
      activated = tf.nn.relu6(tf.nn.bias_add(convolved, offsets),
                              name=scope.name)
      _activation_summary(activated)
    return activated

  def _lrn(inputs, op_name):
    # Local response normalization with the settings shared by norm1/norm2.
    return tf.nn.lrn(inputs, 4, bias=1.0, alpha=0.0001, beta=0.75,
                     name=op_name)

  def _max_pool(inputs, op_name):
    # 3x3 max pooling with stride 2, shared by pool1/pool2/pool5.
    return tf.nn.max_pool(inputs, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                          padding='SAME', name=op_name)

  # conv1 -> norm1 -> pool1
  conv1 = _conv_relu6('conv1', images, [11, 11, 3, 96], [1, 4, 4, 1], 0.0)
  pool1 = _max_pool(_lrn(conv1, 'norm1'), 'pool1')

  # conv2 -> norm2 -> pool2
  conv2 = _conv_relu6('conv2', pool1, [5, 5, 96, 256], [1, 1, 1, 1], 0.1)
  pool2 = _max_pool(_lrn(conv2, 'norm2'), 'pool2')

  # conv3 -> conv4 -> conv5 -> pool5
  conv3 = _conv_relu6('conv3', pool2, [3, 3, 256, 384], [1, 1, 1, 1], 0.1)
  conv4 = _conv_relu6('conv4', conv3, [3, 3, 384, 384], [1, 1, 1, 1], 0.1)
  conv5 = _conv_relu6('conv5', conv4, [3, 3, 384, 256], [1, 1, 1, 1], 0.1)
  pool5 = _max_pool(conv5, 'pool5')

  # local6
  with tf.variable_scope('local6') as scope:
    # Collapse every non-batch dimension into one so that the layer can be
    # computed with a single matrix multiply.
    flat_dim = 1
    for extent in pool5.get_shape()[1:].as_list():
      flat_dim *= extent
    flattened = tf.reshape(pool5, [FLAGS.batch_size, flat_dim])

    fc6_weights = _variable_with_weight_decay(
        'weights', shape=[flat_dim, 4096], stddev=0.04, wd=0.004)
    fc6_biases = _variable_on_cpu('biases', [4096],
                                  tf.constant_initializer(0.1))
    local6 = tf.nn.relu_layer(flattened, fc6_weights, fc6_biases,
                              name=scope.name)
    _activation_summary(local6)

  # local7
  with tf.variable_scope('local7') as scope:
    fc7_weights = _variable_with_weight_decay(
        'weights', shape=[4096, 4096], stddev=0.04, wd=0.004)
    fc7_biases = _variable_on_cpu('biases', [4096],
                                  tf.constant_initializer(0.1))
    local7 = tf.nn.relu_layer(local6, fc7_weights, fc7_biases,
                              name=scope.name)
    _activation_summary(local7)

  # Final linear layer (WX + b); the softmax itself is not applied here.
  with tf.variable_scope('softmax_linear') as scope:
    out_weights = _variable_with_weight_decay(
        'weights', [4096, NUM_CLASSES], stddev=1/4096.0, wd=0.0)
    out_biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
    logits = tf.nn.xw_plus_b(local7, out_weights, out_biases,
                             name=scope.name)
    _activation_summary(logits)

  return logits

TensorFlow 卷积网络的损失函数值异常巨大

0 个答案: