TensorFlow version: 1.3.0
OS: Windows 10
GPUs: 2 × Nvidia Quadro M4000, each with 8 GB of GPU memory
GPU mode: one in WDDM, one in TCC
I tested the official code at https://github.com/tensorflow/models/blob/master/official/resnet/imagenet_main.py, adding only a GPU constraint in the main function:
def main(unused_argv):
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    # Whether visible_device_list is set to "0" or "0,1", the maximum batch_size stays the same
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(visible_device_list='0,1'))
    resnet_classifier = tf.estimator.Estimator(
        model_fn=imagenet_model_fn, model_dir=FLAGS.model_dir,
        config=tf.contrib.learn.RunConfig(session_config=config))

    for cycle in range(FLAGS.train_steps // FLAGS.steps_per_eval):
        tensors_to_log = {
            'learning_rate': 'learning_rate',
            'cross_entropy': 'cross_entropy',
            'train_accuracy': 'train_accuracy'
        }

        logging_hook = tf.train.LoggingTensorHook(
            tensors=tensors_to_log, every_n_iter=100)

        print('Starting a training cycle.')
        resnet_classifier.train(
            input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN),
            steps=FLAGS.first_cycle_steps or FLAGS.steps_per_eval,
            hooks=[logging_hook])
        FLAGS.first_cycle_steps = None

        print('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL))
        print(eval_results)
During training, whether I set visible_device_list to "0,1" or to "0" alone, both runs succeed with batch_size = 48 but BOTH fail with batch_size = 49! This suggests the second GPU's memory is not being used, since the maximum batch size does not grow when two GPUs are visible. I used nvidia-smi to confirm that one or both GPUs, respectively, were active in the experiments above.
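For reference, this is how I double-checked placement beyond nvidia-smi. A minimal standalone sketch (my own debugging snippet, not part of the official script): log_device_placement prints the device each op is assigned to, and allow_growth makes nvidia-smi reflect actual usage rather than TensorFlow's full pre-allocation.

import tensorflow as tf

# Debugging sketch: verify which GPUs ops actually land on.
config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=tf.GPUOptions(visible_device_list='0,1', allow_growth=True))
with tf.Session(config=config) as sess:
    a = tf.random_normal([1024, 1024])
    b = tf.matmul(a, a)
    sess.run(b)  # placement of each op is printed to stderr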
My questions are:

Q1: Is there any way to use a larger batch_size when two GPUs are available?

Q2: If the answer to Q1 is no on Windows, is there any way to do it on Linux? I am not familiar with Linux. Can I set all GPUs to TCC mode there, and would the maximum batch size be larger when both GPUs are in TCC mode? Thank you.
------------- Update -------------
I tried distributing each batch across the two GPUs, and now I get a NaN loss error. Are there any possible causes? The code ran fine before (using only one GPU), but now it produces the NaN loss error even when I set _DEVICE_LIST to a single GPU.
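To narrow down where the NaN first appears, I am thinking of wrapping each per-tower loss in tf.check_numerics inside the per-device loop shown below, so the run fails immediately with a named error instead of the generic NanLossDuringTrainingError. A sketch (my own, untested):

# Debugging sketch: fail fast with a named error as soon as a tower's loss goes NaN/Inf.
cross_entropy = tf.check_numerics(
    cross_entropy, 'NaN/Inf in cross_entropy, device_%d' % dev_idx)
reg_loss = tf.check_numerics(
    reg_loss, 'NaN/Inf in reg_loss, device_%d' % dev_idx)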
My modified code is:
def imagenet_model_fn(features, labels, mode):
    tf.summary.image('images', features, max_outputs=6)

    with tf.device('/cpu:0'):
        split_batch = tf.split(features, len(_DEVICE_LIST))
        split_labels = tf.split(labels, len(_DEVICE_LIST))

    all_predictions = {
        'classes': [],
        'probabilities': []
    }
    all_cross_entropy = []
    all_reg_loss = []

    with tf.variable_scope(tf.get_variable_scope()):
        for dev_idx, (device, device_features, device_labels) in enumerate(zip(
                _DEVICE_LIST, split_batch, split_labels)):
            with tf.device(device):
                with tf.name_scope('device_%d' % dev_idx):
                    logits = network(inputs=device_features,
                                     is_training=(mode == tf.estimator.ModeKeys.TRAIN))
                    tf.get_variable_scope().reuse_variables()
                    all_predictions['classes'].append(tf.argmax(logits, axis=1))
                    all_predictions['probabilities'].append(tf.nn.softmax(logits))

                    if mode == tf.estimator.ModeKeys.TRAIN:
                        # Calculate loss, which includes softmax cross entropy and L2 regularization.
                        cross_entropy = tf.losses.softmax_cross_entropy(
                            logits=logits, onehot_labels=device_labels)
                        reg_loss = FLAGS.weight_decay * tf.add_n(
                            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
                        all_cross_entropy.append(cross_entropy)
                        all_reg_loss.append(reg_loss)

    all_predictions['classes'] = tf.reshape(all_predictions['classes'], [-1])
    all_predictions['probabilities'] = tf.reshape(
        all_predictions['probabilities'], [-1])
    total_cross_entropy = tf.add_n(all_cross_entropy)
    total_reg_loss = tf.add_n(all_reg_loss)
    total_loss = total_cross_entropy + total_reg_loss

    tf.identity(total_cross_entropy, name='cross_entropy')
    tf.summary.scalar('cross_entropy', total_cross_entropy)
    tf.summary.scalar('reg_loss', total_reg_loss)
    tf.summary.scalar('total_loss', total_loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        boundaries = [
            int(batches_per_epoch * epoch) for epoch in [30, 60, 120, 150]]
        values = [
            _INITIAL_LEARNING_RATE * decay for decay in [1, 0.1, 0.01, 1e-3, 1e-4]]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32), boundaries, values)

        tf.identity(learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=_MOMENTUM)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=all_predictions,
        loss=total_loss,
        train_op=train_op)
The error message is:
INFO:tensorflow:Saving checkpoints for 1 into F:\projects\DeepLearning\TensorFlow\Models\ImageNet\resnet_101_imagenet_augmented\temp\model.ckpt.
INFO:tensorflow:learning_rate = 0.003125, cross_entropy = 14.394
INFO:tensorflow:loss = 30.0782, step = 1
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "imagenet_main.py", line 321, in <module>
tf.app.run()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "imagenet_main.py", line 310, in main
hooks=[logging_hook])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 686, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 518, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 862, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 818, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 980, in run
run_metadata=run_metadata))
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 551, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
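One more observation, in case it is relevant: total_cross_entropy = tf.add_n(all_cross_entropy) sums the per-tower mean losses, and the L2 term is appended once per tower, so with two devices the loss scale roughly doubles compared to my single-GPU run (the initial loss 30.0782 above looks about twice what I used to see). A sketch of averaging instead, which would keep the loss scale identical to the single-GPU baseline (my own guess, untested):

# Hypothetical variant (my guess, untested): average the towers and count L2 once.
total_cross_entropy = tf.reduce_mean(tf.stack(all_cross_entropy))
total_reg_loss = all_reg_loss[0]  # identical on every tower (shared variables)
total_loss = total_cross_entropy + total_reg_loss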