I'm trying to train a VGG19 model for a binary image classification problem. My dataset doesn't fit into memory, so I train in batches using the model's .fit_generator function.

However, even when training in batches, I get the following error:

W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 392.00MiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape
Here's the console output about my GPU at the start of the training script:
Using TensorFlow backend.
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so locally
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: GeForce GT 750M
major: 3 minor: 0 memoryClockRate (GHz) 1.085
pciBusID 0000:01:00.0
Total memory: 1.95GiB
Free memory: 1.74GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0)
I don't know, but I'd think 1.5+ GB of free memory should be enough to train on small batches, right?

The full output of the script is quite big, so I've pasted a part of it to this pastebin.

Here's the code of my model:
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
class VGG19(object):
    def __init__(self, weights_path=None, train_folder='data/train', validation_folder='data/val'):
        self.weights_path = weights_path
        self.model = self._init_model()

        if weights_path:
            self.model.load_weights(weights_path)
        else:
            self.datagen = self._datagen()

        self.train_folder = train_folder
        self.validation_folder = validation_folder

        self.model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )

    def fit(self, batch_size=32, nb_epoch=10):
        train_generator = self.datagen.flow_from_directory(
            self.train_folder, target_size=(224, 224),
            color_mode='rgb', class_mode='binary',
            batch_size=2
        )
        validation_generator = self.datagen.flow_from_directory(
            self.validation_folder, target_size=(224, 224),
            color_mode='rgb', class_mode='binary',
            batch_size=2
        )
        self.model.fit_generator(
            train_generator,
            samples_per_epoch=16,
            nb_epoch=1,
            verbose=1,
            validation_data=validation_generator,
            callbacks=[
                TensorBoard(log_dir='./logs', write_images=True),
                ModelCheckpoint(filepath='weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss'),
                ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.001)
            ],
            nb_val_samples=8
        )

    def evaluate(self, X, y, batch_size=32):
        return self.model.evaluate(
            X, y,
            batch_size=batch_size,
            verbose=1
        )

    def predict(self, X, batch_size=4, verbose=1):
        return self.model.predict(X, batch_size=batch_size, verbose=verbose)

    def predict_proba(self, X, batch_size=4, verbose=1):
        return self.model.predict_proba(X, batch_size=batch_size, verbose=verbose)

    def _init_model(self):
        model = Sequential()
        model.add(ZeroPadding2D((1, 1), input_shape=(224, 224, 3)))
        model.add(Convolution2D(64, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(64, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(128, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(128, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        model.add(Flatten())
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='softmax'))

        return model

    def _datagen(self):
        return ImageDataGenerator(
            featurewise_center=True,
            samplewise_center=False,
            featurewise_std_normalization=True,
            samplewise_std_normalization=False,
            zca_whitening=False,
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True,
            vertical_flip=True
        )
I run the model as follows:

vgg19 = VGG19(train_folder='data/train/train', validation_folder='data/val/val')
vgg19.fit(nb_epoch=1)

The data/train/train and data/val/val folders each contain two directories, cats and dogs, so that the ImageDataGenerator.flow_from_directory() function can correctly separate my classes.
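For reference, assuming the paths above, this is the on-disk layout that flow_from_directory() infers the two classes from (one subdirectory per class):

data/
  train/
    train/
      cats/
      dogs/
  val/
    val/
      cats/
      dogs/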
What am I doing wrong here? Is VGG19 simply too big for my machine, or is it a problem with the batch size?

How can I train this model on my machine?
P.S. Even though the training script prints many errors similar to the one in the pastebin above, the last lines of its output are the following:
W tensorflow/core/common_runtime/bfc_allocator.cc:274] *****************************************************************************************xxxxxxxxxxx
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 392.00MiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape[25088,4096]
Traceback (most recent call last):
  File "train.py", line 6, in <module>
    vgg19.fit(nb_epoch=1)
  File "/home/denis/WEB/DeepLearning/CatsVsDogs/model/vgg19.py", line 84, in fit
    nb_val_samples=8
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 907, in fit_generator
    pickle_safe=pickle_safe)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1378, in fit_generator
    callbacks._set_model(callback_model)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 32, in _set_model
    callback._set_model(model)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 493, in _set_model
    self.sess = KTF.get_session()
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 111, in get_session
    _initialize_variables()
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 200, in _initialize_variables
    sess.run(tf.variables_initializer(uninitialized_variables))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 766, in run
    run_metadata_ptr)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 964, in _run
    feed_dict_string, options, run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1014, in _do_run
    target_list, options, run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1034, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[4096]
     [[Node: Variable_43/Assign = Assign[T=DT_FLOAT, _class=["loc:@Variable_43"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Variable_43, Const_59)]]

Caused by op u'Variable_43/Assign', defined at:
  File "train.py", line 6, in <module>
    vgg19.fit(nb_epoch=1)
  File "/home/denis/WEB/DeepLearning/CatsVsDogs/model/vgg19.py", line 84, in fit
    nb_val_samples=8
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 907, in fit_generator
    pickle_safe=pickle_safe)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1351, in fit_generator
    self._make_train_function()
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 696, in _make_train_function
    self.total_loss)
  File "/usr/local/lib/python2.7/dist-packages/keras/optimizers.py", line 387, in get_updates
    ms = [K.zeros(shape) for shape in shapes]
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 278, in zeros
    dtype, name)
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 182, in variable
    v = tf.Variable(value, dtype=_convert_string_dtype(dtype), name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 224, in __init__
    expected_shape=expected_shape)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 360, in _init_from_args
    validate_shape=validate_shape).op
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 47, in assign
    use_locking=use_locking, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[4096]
     [[Node: Variable_43/Assign = Assign[T=DT_FLOAT, _class=["loc:@Variable_43"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Variable_43, Const_59)]]
UPDATE: following @rmeertens's suggestion, I made the last Dense layers of the model smaller, and the error changed. It's still an OOM error, though: pastebin.com/SamkUbJA
Answer 0 (score: 5)
I'd be very surprised if you told me this model was working at all.

A softmax activation on a single output makes no sense: softmax normalizes a layer's outputs so that they sum to 1... with only one output, it will therefore always be 1! So if you want binary probabilities, either use a sigmoid on 1 output or a softmax on 2 outputs.
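To make that concrete, here is a minimal sketch of the two valid output heads for this binary task (my illustration of the answer's point, not code from the original post; model is the Sequential model built in the question, and each option implies the matching loss and class_mode):

# Option A: one unit + sigmoid; keep loss='binary_crossentropy'
# and class_mode='binary' in flow_from_directory, as the question already does.
model.add(Dense(1, activation='sigmoid'))

# Option B: two units + softmax; switch to loss='categorical_crossentropy'
# and class_mode='categorical' so labels are one-hot encoded.
model.add(Dense(2, activation='softmax'))

# Use exactly one of the two: with a single unit, softmax always outputs 1.0.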
Answer 1 (score: 4)
The OOM error in this case occurs because your graph is too big. What was the shape of the tensor you were trying to allocate when everything blew up?

Anyway, the first thing you could try is allocating the model without having any data in memory. Is anything else still running (another Jupyter notebook, some other model being served in the background)?
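One way to see how much memory the graph actually needs is to let TensorFlow allocate GPU memory on demand instead of reserving nearly all of it up front. This is a side note rather than part of the original answer; a minimal sketch using the TF 0.x / Keras 1.x APIs that appear elsewhere in this question:

import tensorflow as tf
from keras import backend as K

config = tf.ConfigProto()
config.gpu_options.allow_growth = True     # grow the GPU allocation as needed
K.set_session(tf.Session(config=config))   # must run before building the model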
Also, maybe you can save some space in the last layers:
model.add(Dense(4096, activation='relu'))
model.add(Dense(4096, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
A 4096x4096 matrix is pretty big, and dropping straight back to 1 output is a bad idea anyway ;)
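As a back-of-the-envelope check (my arithmetic, not part of the original answer): the first fully connected layer alone accounts for exactly the 392.00MiB allocation in the log above, and the Adam optimizer keeps two extra accumulators of the same shape (the ms = [K.zeros(shape) ...] line in the traceback), roughly tripling that footprint:

# Flatten() on the final 7x7x512 feature map gives 25088 inputs to Dense(4096),
# matching the tensor shape [25088, 4096] reported in the OOM message.
weights = 25088 * 4096          # 102,760,448 parameters
mib = weights * 4 / 2.0 ** 20   # float32 takes 4 bytes each
print(mib)                      # 392.0 -- exactly the allocation that failed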
Answer 2 (score: 0)
In my case, I solved the problem by changing Convolution2D -> Conv2D:
from keras.layers import Conv2D
model.add(Convolution2D(64, 3, 3, activation='relu')) -> model.add(Conv2D(64, 3, 3, activation='relu'))
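One caveat if you make this change (assuming, as the import suggests, that you are moving to Keras 2, where Conv2D lives): in Keras 2 the third positional argument of Conv2D is strides, not the second kernel dimension, so Conv2D(64, 3, 3) is not equivalent to Convolution2D(64, 3, 3). Pass the kernel size as a single tuple:

from keras.layers import Conv2D

# Keras 1: Convolution2D(64, 3, 3, activation='relu')
# Keras 2 equivalent -- kernel size as one tuple:
model.add(Conv2D(64, (3, 3), activation='relu'))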