TensorFlow 的 MirroredStrategy() 会把训练模型拆分到多个 GPU 上吗?
我正在尝试运行 3D-UNet,在单个 GPU 上,训练样本的尺寸最大只能到 224x224x224。我尝试用 MirroredStrategy()
和 with tf.device():
把模型的一部分放到第二个 GPU 上,但仍然无法突破 224x224x224 的限制。如果增大输入体数据(volume)的尺寸,就会得到 ResourceExhaustedError
。
代码:
def get_model(optimizer, loss_metric, metrics, lr=1e-3):
    """Build and compile a 3D U-Net with device placement hints per stage.

    The contracting path (four conv/pool/dropout levels plus the 512-filter
    bottleneck) is built under a GPU:0 device scope, the expanding path and
    the sigmoid output layer under a GPU:1 scope, and the ``Model`` is
    assembled and compiled under a CPU:0 scope.

    The input shape is taken from the module-level names ``sample_width``,
    ``sample_height`` and ``sample_depth`` (defined outside this function),
    with a single trailing channel.

    Args:
        optimizer: Optimizer class (not an instance); it is called as
            ``optimizer(lr=lr)``.
        loss_metric: Loss passed straight to ``model.compile``.
        metrics: Metrics list passed straight to ``model.compile``.
        lr: Learning rate handed to the optimizer constructor.

    Returns:
        The compiled model.
    """
    gpu_0 = '/job:localhost/replica:0/task:0/device:GPU:0'
    gpu_1 = '/job:localhost/replica:0/task:0/device:GPU:1'
    cpu_0 = '/job:localhost/replica:0/task:0/device:CPU:0'

    def conv_pair(filters, tensor):
        # Two stacked 3x3x3 ReLU convolutions with 'same' padding.
        for _ in range(2):
            tensor = Conv3D(filters, (3, 3, 3), activation='relu', padding='same')(tensor)
        return tensor

    # Contracting path + bottleneck.
    with tf.device(gpu_0):
        inputs = Input((sample_width, sample_height, sample_depth, 1))
        skips = []
        features = inputs
        for filters, drop_rate in ((32, 0.5), (64, 0.5), (128, 0.3), (256, 0.3)):
            features = conv_pair(filters, features)
            # Keep the pre-pooling activations for the skip connections.
            skips.append(features)
            features = MaxPooling3D(pool_size=(2, 2, 2))(features)
            features = Dropout(drop_rate)(features)
        features = conv_pair(512, features)

    # Expanding path: upsample, concatenate with the matching skip (deepest
    # first) on the channel axis, then convolve twice.
    with tf.device(gpu_1):
        for filters, skip in zip((256, 128, 64, 32), reversed(skips)):
            upsampled = Conv3DTranspose(
                filters, (2, 2, 2), strides=(2, 2, 2), padding='same'
            )(features)
            features = concatenate([upsampled, skip], axis=4)
            features = conv_pair(filters, features)
        outputs = Conv3D(1, (1, 1, 1), activation='sigmoid')(features)

    with tf.device(cpu_0):
        model = Model(inputs=[inputs], outputs=[outputs])
        model.compile(optimizer=optimizer(lr=lr), loss=loss_metric, metrics=metrics)

    return model
# Distribution strategy over both local GPUs, with hierarchical-copy
# all-reduce as the cross-device op.
mirrored_strategy = tf.distribute.MirroredStrategy(
    devices=[
        "/job:localhost/replica:0/task:0/device:GPU:0",
        "/job:localhost/replica:0/task:0/device:GPU:1",
    ],
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(),
)

# The model must be created and compiled inside the strategy scope.
with mirrored_strategy.scope():
    model = get_model(
        optimizer=Adam,
        loss_metric=dice_coef_loss,
        metrics=[dice_coef],
        lr=1e-3,
    )
ResourceExhaustedError:
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-1-7a601312fa7a> in <module>
405 # e_drive_model_dir = '\\models\\'
406 model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, save_freq = 1000)
--> 407 model.fit(train_x, train_y, batch_size= 2, epochs= 10000, verbose=1, shuffle=True, validation_split=0, callbacks=[model_checkpoint])
408
409 model.save('unet_seg_final_3d_test.model')
~\.conda\envs\gputest\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
647 steps_per_epoch=steps_per_epoch,
648 validation_steps=validation_steps,
--> 649 validation_freq=validation_freq)
650
651 batch_size = self._validate_or_infer_batch_size(
~\.conda\envs\gputest\lib\site-packages\tensorflow\python\keras\engine\training_distributed.py in fit_distributed(model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq)
141 validation_steps=validation_steps,
142 validation_freq=validation_freq,
--> 143 steps_name='steps_per_epoch')
144
145
~\.conda\envs\gputest\lib\site-packages\tensorflow\python\keras\engine\training_arrays.py in model_iteration(model, inputs, targets, sample_weights, batch_size, epochs, verbose, callbacks, val_inputs, val_targets, val_sample_weights, shuffle, initial_epoch, steps_per_epoch, validation_steps, validation_freq, mode, validation_in_fit, prepared_feed_values_from_dataset, steps_name, **kwargs)
272 # `ins` can be callable in tf.distribute.Strategy + eager case.
273 actual_inputs = ins() if callable(ins) else ins
--> 274 batch_outs = f(actual_inputs)
275 except errors.OutOfRangeError:
276 if is_dataset:
~\.conda\envs\gputest\lib\site-packages\tensorflow\python\keras\backend.py in __call__(self, inputs)
3290
3291 fetched = self._callable_fn(*array_vals,
-> 3292 run_metadata=self.run_metadata)
3293 self._call_fetch_callbacks(fetched[-len(self._fetches):])
3294 output_structure = nest.pack_sequence_as(
~\.conda\envs\gputest\lib\site-packages\tensorflow\python\client\session.py in __call__(self, *args, **kwargs)
1456 ret = tf_session.TF_SessionRunCallable(self._session._session,
1457 self._handle, args,
-> 1458 run_metadata_ptr)
1459 if run_metadata:
1460 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[1,32,240,240,240] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node Adam/gradients/conv3d_17_1/Conv3D_grad/Conv3DBackpropInputV2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[1,32,240,240,240] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node Adam/gradients/conv3d_17_1/Conv3D_grad/Conv3DBackpropInputV2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[GroupCrossDeviceControlEdges_0/Adam/Adam/update_1/Const/_1070]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
1 successful operations.
0 derived errors ignored.
答案 0 :(得分:0)
尽管已经很晚了,但我希望这个答案将来能对其他人有所帮助。
请注意,这已经在TF 2.0上进行了测试,因此它可能不适用于较早的版本。
对问题第一部分的简短回答:
MirroredStrategy()
不会把模型拆分到不同的GPU上;它在每个GPU上复制一份模型,并拆分批次。因此,如果在有2个GPU的机器上以批大小32训练模型,每个GPU将获得16个示例;然后汇总全部32个示例的梯度,用来更新模型。
如何将模型本身拆分?
经过反复试验,我有以下内容:
1. 各个图层和操作可以分别放在不同的设备上,但是一旦把它们封装进tf.keras.Model
的单个实例,整个模型就只能在单个设备上调用。
2. 模型中的图层可以在模型外部被引用并单独使用:它们可以作为独立的操作调用,而不必把模型作为一个整体来调用。
3. 在保存和还原模型时,只需还原权重即可:带权重的图层按第2点的方式使用,不含变量的图层则直接创建新实例。
结合这三点,一种在多个GPU上进行模型拆分以进行训练和推理的方法是,首先在单个GPU上创建图形(tf.keras.Model
),然后在单独的GPU上复制各个组件。
最小示例:
def create_model():
    """Build a small two-branch convolutional ``tf.keras.Model``.

    The same input (3 channels, unspecified height/width) feeds two
    independent 64-filter 3x3 ReLU convolutions; their outputs are
    concatenated and passed through a 3-filter 3x3 sigmoid convolution.

    Returns:
        An uncompiled ``tf.keras.Model`` with one input and one output.
    """
    image = Input((None, None, 3))
    # Two parallel branches reading the same input tensor.
    branches = [Conv2D(64, (3, 3), activation='relu')(image) for _ in range(2)]
    merged = Concatenate()(branches)
    prediction = Conv2D(3, (3, 3), activation='sigmoid')(merged)
    return tf.keras.Model(inputs=[image], outputs=[prediction])
def model_graph(input, model):
    """Re-run a two-branch model's layers with each branch on its own GPU.

    The layers of ``model`` that own trainable variables are collected in
    ``model.layers`` order and reused directly: the first is called on
    ``input`` under ``/gpu:0``, the second on ``input`` under ``/gpu:1``,
    and the third on the concatenation of both results.

    Args:
        input: Tensor fed to both branch layers. (The name shadows the
            builtin; it is kept because it is part of the signature.)
        model: Model providing the weighted layers. Assumes the first three
            weighted layers are branch, branch, output head, as in
            ``create_model`` - verify for any other architecture.

    Returns:
        The output tensor of the third weighted layer.
    """
    # Only layers that carry weights need to be reused from the model.
    weighted = [
        layer for layer in model.layers if len(layer.trainable_variables) > 0
    ]

    with tf.device('/gpu:0'):
        branch_a = weighted[0](input)
    with tf.device('/gpu:1'):
        branch_b = weighted[1](input)

    # Variable-free layers can simply be instantiated afresh.
    merged = Concatenate()([branch_a, branch_b])
    return weighted[2](merged)
# Build the model once; its layers (and weights) are reused by both call styles.
model = create_model()
要在单个设备上使用模型时,可以使用:
# Call the model as a whole on `inputs`.
output = model(inputs)
当您将其拆分到两个设备上时,可以使用:
# Split execution across two devices. The signature is
# model_graph(input, model), so the input tensor goes first and the model second.
output = model_graph(inputs, model)