TensorFlow version: 1.3.0
OS: Windows 10
GPUs: 2 × Nvidia Quadro M4000, each with 8 GB of GPU memory
GPU mode: one in WDDM, one in TCC
I tested the official code at https://github.com/tensorflow/models/blob/master/official/resnet/imagenet_main.py, adding only a GPU constraint in the main function:
def main(unused_argv):
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    # Whether visible_device_list is set to "0" or "0,1", the maximum batch_size stays the same
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(visible_device_list='0,1'))
    resnet_classifier = tf.estimator.Estimator(
        model_fn=imagenet_model_fn, model_dir=FLAGS.model_dir,
        config=tf.contrib.learn.RunConfig(session_config=config))

    for cycle in range(FLAGS.train_steps // FLAGS.steps_per_eval):
        tensors_to_log = {
            'learning_rate': 'learning_rate',
            'cross_entropy': 'cross_entropy',
            'train_accuracy': 'train_accuracy'
        }

        logging_hook = tf.train.LoggingTensorHook(
            tensors=tensors_to_log, every_n_iter=100)

        print('Starting a training cycle.')
        resnet_classifier.train(
            input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN),
            steps=FLAGS.first_cycle_steps or FLAGS.steps_per_eval,
            hooks=[logging_hook])
        FLAGS.first_cycle_steps = None

        print('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL))
        print(eval_results)
During training, whether I set visible_device_list to "0,1" or to "0" alone, both runs succeed with batch_size = 48 but BOTH fail with batch_size = 49! This suggests the second GPU's memory is not being used, since the maximum batch size does not grow when two GPUs are visible. I used nvidia-smi to confirm that one or both GPUs, respectively, were active in the experiments above.
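For reference, this is how I double-checked placement beyond nvidia-smi. A minimal standalone sketch (my own debugging snippet, not part of the official script): log_device_placement prints the device each op is assigned to, and allow_growth makes nvidia-smi reflect actual usage rather than TensorFlow's full pre-allocation.

import tensorflow as tf

# Debugging sketch: verify which GPUs ops actually land on.
config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=tf.GPUOptions(visible_device_list='0,1', allow_growth=True))
with tf.Session(config=config) as sess:
    a = tf.random_normal([1024, 1024])
    b = tf.matmul(a, a)
    sess.run(b)  # placement of each op is printed to stderr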
My questions are:

Q1: Is there any way to use a larger batch_size when two GPUs are available?

Q2: If the answer to Q1 is no on Windows, is there any way to do it on Linux? I am not familiar with Linux. Can I set all GPUs to TCC mode there, and would the maximum batch size be larger when both GPUs are in TCC mode? Thank you.
------------- Update -------------
I tried distributing each batch across the two GPUs, and now I get a NaN loss error. Are there any possible causes? The code ran fine before (using only one GPU), but now it produces the NaN loss error even when I set _DEVICE_LIST to a single GPU.
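To narrow down where the NaN first appears, I am thinking of wrapping each per-tower loss in tf.check_numerics inside the per-device loop shown below, so the run fails immediately with a named error instead of the generic NanLossDuringTrainingError. A sketch (my own, untested):

# Debugging sketch: fail fast with a named error as soon as a tower's loss goes NaN/Inf.
cross_entropy = tf.check_numerics(
    cross_entropy, 'NaN/Inf in cross_entropy, device_%d' % dev_idx)
reg_loss = tf.check_numerics(
    reg_loss, 'NaN/Inf in reg_loss, device_%d' % dev_idx)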
My modified code is:
def imagenet_model_fn(features, labels, mode):
    tf.summary.image('images', features, max_outputs=6)

    with tf.device('/cpu:0'):
        split_batch = tf.split(features, len(_DEVICE_LIST))
        split_labels = tf.split(labels, len(_DEVICE_LIST))

    all_predictions = {
        'classes': [],
        'probabilities': []
    }
    all_cross_entropy = []
    all_reg_loss = []

    with tf.variable_scope(tf.get_variable_scope()):
        for dev_idx, (device, device_features, device_labels) in enumerate(zip(
                _DEVICE_LIST, split_batch, split_labels)):
            with tf.device(device):
                with tf.name_scope('device_%d' % dev_idx):
                    logits = network(inputs=device_features,
                                     is_training=(mode == tf.estimator.ModeKeys.TRAIN))
                    tf.get_variable_scope().reuse_variables()
                    all_predictions['classes'].append(tf.argmax(logits, axis=1))
                    all_predictions['probabilities'].append(tf.nn.softmax(logits))

                    if mode == tf.estimator.ModeKeys.TRAIN:
                        # Calculate loss, which includes softmax cross entropy and L2 regularization.
                        cross_entropy = tf.losses.softmax_cross_entropy(
                            logits=logits, onehot_labels=device_labels)
                        reg_loss = FLAGS.weight_decay * tf.add_n(
                            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
                        all_cross_entropy.append(cross_entropy)
                        all_reg_loss.append(reg_loss)

    all_predictions['classes'] = tf.reshape(all_predictions['classes'], [-1])
    all_predictions['probabilities'] = tf.reshape(
        all_predictions['probabilities'], [-1])
    total_cross_entropy = tf.add_n(all_cross_entropy)
    total_reg_loss = tf.add_n(all_reg_loss)
    total_loss = total_cross_entropy + total_reg_loss

    tf.identity(total_cross_entropy, name='cross_entropy')
    tf.summary.scalar('cross_entropy', total_cross_entropy)
    tf.summary.scalar('reg_loss', total_reg_loss)
    tf.summary.scalar('total_loss', total_loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        boundaries = [
            int(batches_per_epoch * epoch) for epoch in [30, 60, 120, 150]]
        values = [
            _INITIAL_LEARNING_RATE * decay for decay in [1, 0.1, 0.01, 1e-3, 1e-4]]
        learning_rate = tf.train.piecewise_constant(
            tf.cast(global_step, tf.int32), boundaries, values)

        tf.identity(learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', learning_rate)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=_MOMENTUM)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step)
    else:
        train_op = None

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=all_predictions,
        loss=total_loss,
        train_op=train_op)
The error message is:
INFO:tensorflow:Saving checkpoints for 1 into F:\projects\DeepLearning\TensorFlow\Models\ImageNet\resnet_101_imagenet_augmented\temp\model.ckpt.
INFO:tensorflow:learning_rate = 0.003125, cross_entropy = 14.394
INFO:tensorflow:loss = 30.0782, step = 1
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "imagenet_main.py", line 321, in <module>
tf.app.run()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "imagenet_main.py", line 310, in main
hooks=[logging_hook])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 686, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 518, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 862, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 818, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 980, in run
run_metadata=run_metadata))
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 551, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
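One more observation, in case it is relevant: total_cross_entropy = tf.add_n(all_cross_entropy) sums the per-tower mean losses, and the L2 term is appended once per tower, so with two devices the loss scale roughly doubles compared to my single-GPU run (the initial loss 30.0782 above looks about twice what I used to see). A sketch of averaging instead, which would keep the loss scale identical to the single-GPU baseline (my own guess, untested):

# Hypothetical variant (my guess, untested): average the towers and count L2 once.
total_cross_entropy = tf.reduce_mean(tf.stack(all_cross_entropy))
total_reg_loss = all_reg_loss[0]  # identical on every tower (shared variables)
total_loss = total_cross_entropy + total_reg_loss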