我使用 Keras 2.0.8(TensorFlow 1.3 后端)中的 VGG16 模型训练一个视频网络。问题是:训练过程一开始在各批数据上运行正常,但经过若干个 epoch 之后会返回 OOM(显存不足)错误。Keras 是否存在内存泄漏问题?还是我在训练网络时犯了错误?我在这里贴出了日志的最后一部分,该错误发生在完成 20 个 epoch 之后。生成器(generator)是我自己编写的,但网络采用的是 VGG16 架构。所用硬件是 1080 Ti。以下是日志:
90/128 [====================>.........] - ETA: 162s - loss: 5.6777 - acc: 0.0504
91/128 [====================>.........] - ETA: 158s - loss: 5.6761 - acc: 0.0504
92/128 [====================>.........] - ETA: 153s - loss: 5.6753 - acc: 0.0507
93/128 [====================>.........] - ETA: 149s - loss: 5.6772 - acc: 0.0503
94/128 [=====================>........] - ETA: 145s - loss: 5.6765 - acc: 0.0504
Traceback (most recent call last):
File "/media/mrrl/FVolume2/abolfazl/Tensorflow/VGG16finetune.py", line 175, in <module>
model.fit_generator(generator(trainfile,labels,videolength,batch_size), steps_per_epoch=128, epochs=346, verbose=1, callbacks=[checkpointer,lrate], validation_data=generator_val(testfile,labels_val,videolength_val,batch_size), validation_steps=51)#, initial_epoch=187
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/models.py", line 1121, in fit_generator
initial_epoch=initial_epoch)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/engine/training.py", line 2042, in fit_generator
class_weight=class_weight)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/engine/training.py", line 1762, in train_on_batch
outputs = self.train_function(ins)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 2273, in __call__
**self.session_kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
options, run_metadata)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
ResourceExhaustedError: OOM when allocating tensor with shape[75,64,226,226]
[[Node: training/SGD/gradients/block1_pool/MaxPool_grad/MaxPoolGrad = MaxPoolGrad[T=DT_FLOAT, _class=["loc:@block1_pool/MaxPool"], data_format="NHWC", ksize=[1, 2, 2, 1], padding="VALID", strides=[1, 2, 2, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](block1_conv2/Relu, block1_pool/MaxPool, training/SGD/gradients/block2_conv1/convolution_grad/Conv2DBackpropInput)]]
Caused by op u'training/SGD/gradients/block1_pool/MaxPool_grad/MaxPoolGrad', defined at:
File "<string>", line 1, in <module>
File "/usr/lib/python2.7/idlelib/run.py", line 116, in main
ret = method(*args, **kwargs)
File "/usr/lib/python2.7/idlelib/run.py", line 324, in runcode
exec code in self.locals
File "/media/mrrl/FVolume2/abolfazl/Tensorflow/VGG16finetune.py", line 175, in <module>
model.fit_generator(generator(trainfile,labels,videolength,batch_size), steps_per_epoch=128, epochs=346, verbose=1, callbacks=[checkpointer,lrate], validation_data=generator_val(testfile,labels_val,videolength_val,batch_size), validation_steps=51)#, initial_epoch=187
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/models.py", line 1121, in fit_generator
initial_epoch=initial_epoch)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/engine/training.py", line 1926, in fit_generator
self._make_train_function()
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/engine/training.py", line 960, in _make_train_function
loss=self.total_loss)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/optimizers.py", line 156, in get_updates
grads = self.get_gradients(loss, params)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/optimizers.py", line 73, in get_gradients
grads = K.gradients(loss, params)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 2310, in gradients
return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in gradients
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 348, in _MaybeCompile
return grad_fn() # Exit early
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in <lambda>
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/nn_grad.py", line 526, in _MaxPoolGrad
data_format=op.get_attr("data_format"))
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1752, in _max_pool_grad
data_format=data_format, name=name)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
...which was originally created as op u'block1_pool/MaxPool', defined at:
File "<string>", line 1, in <module>
[elided 1 identical lines from previous traceback]
File "/usr/lib/python2.7/idlelib/run.py", line 324, in runcode
exec code in self.locals
File "/media/mrrl/FVolume2/abolfazl/Tensorflow/VGG16finetune.py", line 142, in <module>
model = VGG_16('my_model_weights.h5')
File "/media/mrrl/FVolume2/abolfazl/Tensorflow/VGG16finetune.py", line 22, in VGG_16
model.add(MaxPooling2D((2,2), strides=(2,2), name='block1_pool'))
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/models.py", line 475, in add
output_tensor = layer(self.outputs[0])
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/engine/topology.py", line 602, in __call__
output = self.call(inputs, **kwargs)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/layers/pooling.py", line 154, in call
data_format=self.data_format)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/layers/pooling.py", line 217, in _pooling_function
pool_mode='max')
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 3386, in pool2d
x = tf.nn.max_pool(x, pool_size, strides, padding=padding)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/nn_ops.py", line 1772, in max_pool
name=name)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1605, in _max_pool
data_format=data_format, name=name)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/mrrl/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[75,64,226,226]
[[Node: training/SGD/gradients/block1_pool/MaxPool_grad/MaxPoolGrad = MaxPoolGrad[T=DT_FLOAT, _class=["loc:@block1_pool/MaxPool"], data_format="NHWC", ksize=[1, 2, 2, 1], padding="VALID", strides=[1, 2, 2, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](block1_conv2/Relu, block1_pool/MaxPool, training/SGD/gradients/block2_conv1/convolution_grad/Conv2DBackpropInput)]]
下面是 VGG16 结构,其输入层有 20 个通道,输出为 101 个类别。
# Build a VGG16-style Sequential classifier: a 224x224 input with 20 channels,
# five convolutional blocks, two 4096-unit dense layers and a 101-way softmax.
# Relies on Sequential, ZeroPadding2D, Conv2D, MaxPooling2D, Flatten, Dense,
# Dropout and regularizers being imported from Keras elsewhere in the file.

# L2 penalty factor applied to every convolutional and dense kernel.
weight_decay = 0.0005

# One entry per convolutional block: (filters, conv layer names, pool name).
# Layer names are kept exactly as-is because they identify the layers when
# weights are loaded or saved by name.
# NOTE(review): 'block1_conv1m' (and 'predictionsnew' below) differ from the
# stock VGG16 names -- presumably so that pretrained weights for the 3-channel
# input / original classifier are not matched by name; confirm with the
# weight-loading code.
vgg_blocks = [
    (64, ('block1_conv1m', 'block1_conv2'), 'block1_pool'),
    (128, ('block2_conv1', 'block2_conv2'), 'block2_pool'),
    (256, ('block3_conv1', 'block3_conv2', 'block3_conv3'), 'block3_pool'),
    (512, ('block4_conv1', 'block4_conv2', 'block4_conv3'), 'block4_pool'),
    (512, ('block5_conv1', 'block5_conv2', 'block5_conv3'), 'block5_pool'),
]

model = Sequential()

# Input layer: 20 channels in our work.
# NOTE(review): every convolution below already uses padding='same', so this
# explicit 1-pixel border makes block 1 operate on 226x226 feature maps
# instead of 224x224. It looks like a leftover from a 'valid'-padding VGG
# definition and costs extra activation memory -- confirm before removing.
model.add(ZeroPadding2D(padding=(1, 1), data_format="channels_last", input_shape=(224, 224, 20)))
# Parenthesised form prints the same tuple on Python 2 and is valid on Python 3.
print(model.output_shape)

# Blocks 1-5: a stack of 3x3 ReLU convolutions followed by 2x2 max pooling.
for filters, conv_names, pool_name in vgg_blocks:
    for conv_name in conv_names:
        model.add(Conv2D(filters, (3, 3), activation='relu', padding='same', name=conv_name, kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), name=pool_name))

# Classifier head: two heavily regularised dense layers, then the softmax.
model.add(Flatten(name='flatten'))
model.add(Dense(4096, activation='relu', name='fc1', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dropout(0.9))
model.add(Dense(4096, activation='relu', name='fc2', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dropout(0.8))
model.add(Dense(101, activation='softmax', name='predictionsnew', kernel_initializer='random_uniform', bias_initializer='zeros', kernel_regularizer=regularizers.l2(weight_decay)))