OOM error on layers I don't want to train (transfer learning)

Asked: 2019-08-25 18:44:57

Tags: python keras transfer-learning

I am currently working on a project in which I compare several neural networks; one of the models classifies the cifar10 dataset with a classifier trained on top of VGG16.

I had no problems training the other models, but with VGG16 I run into an out-of-memory error, even though I do not want to train the layers in question.

import numpy as np
import os
import matplotlib.pyplot as plt
import keras

from keras.models import Model
from keras.models import Sequential
from keras import layers
from keras.applications.vgg16 import VGG16
import keras.losses as losses
import keras.optimizers as optimizers

from keras.datasets import mnist
from keras.datasets import cifar10

from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

import cv2

cwd = os.getcwd()


# define variables used later, such as data paths and image sizes
which_data = 'cifar10'

img_size = 32
img_depth = 3
batch_size = 20
class_count = 10
use_percentage = 0.5  # fraction of the dataset that is actually used

input_shape = (img_size, img_size, img_depth)

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

x_train = x_train[:int((x_train.shape[0]*use_percentage))]
y_train = y_train[:int((y_train.shape[0]*use_percentage))]


x_test = x_test[:int((x_test.shape[0]*use_percentage))]
y_test = y_test[:int((y_test.shape[0]*use_percentage))]


print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'reduced train samples')
print(x_test.shape[0], 'reduced test samples')

y_train = keras.utils.to_categorical(y_train, class_count)
y_test = keras.utils.to_categorical(y_test, class_count)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255


steps_per_epoch = x_train.shape[0] // batch_size
validation_steps = x_test.shape[0] // batch_size
epochs = 20
verbose = 1  # information output


## VGG16 model: pretrained convolutional base with a new classification head
input_tensor = layers.Input(shape=input_shape)
model = VGG16(weights='imagenet', include_top=False, input_tensor=input_tensor)

layer_dict = dict([(layer.name, layer) for layer in model.layers])


# new classification head on top of the last VGG16 pooling layer
X = layer_dict['block5_pool'].output

X = layers.Flatten()(X)
X = layers.Dense(class_count, activation='softmax')(X)


transfer_simple = Model(inputs=model.input, outputs=X)

# freeze all 19 VGG16 layers (including the input layer);
# only the new Flatten/Dense head remains trainable
for layer in transfer_simple.layers[:19]:
    layer.trainable = False


print(transfer_simple.summary())


transfer_simple.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.SGD(lr=0.02), metrics=['accuracy'])


ts_history=transfer_simple.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        epochs=epochs,
        verbose=verbose,
        shuffle=True
)


transfer_simple.save_weights(which_data + '/advanced_cnn' + str(epochs) + '_weights.h5')

The model summary shows only 5,130 trainable parameters, yet I still get the following error:

ResourceExhaustedError: OOM when allocating tensor with shape[25000,64,32,32] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[Node: block1_conv1_5/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](block1_conv1_5/convolution-0-TransposeNHWCToNCHW-LayoutOptimizer, block1_conv1_5/kernel/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

     [[Node: loss_5/mul/_1255 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_402_loss_5/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

I am really out of ideas and would greatly appreciate your help!

2 answers:

Answer 0 (score: 1)

While I did not find a solution to the underlying problem, I did find a workaround. Instead of training my network with the fit() function, I used the following:


training_stream = ImageDataGenerator().flow(x=x_train, y=y_train, batch_size=batch_size, shuffle=True)
validation_stream = ImageDataGenerator().flow(x=x_test, y=y_test, batch_size=batch_size, shuffle=True)

ts_history=transfer_simple.fit_generator(
        training_stream,
        validation_data=validation_stream,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        epochs=epochs,
        verbose=verbose,
        shuffle=True
)

This seems to process the input data in smaller chunks and got rid of the OOM error.

Answer 1 (score: 0)

There are many possible reasons for running out of memory, but the first things to check are:

  • Is your batch_size too large? Reduce batch_size.
  • Something odd is happening in the current session: tf.keras.backend.clear_session() usually helps, and sometimes you have to restart the kernel (a short sketch of these first two checks follows this list).
  • Is your network too big?
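
As a minimal sketch of the first two checks (not part of the original answer; it assumes a TensorFlow backend and reuses batch_size, x_train and x_test from the question):

import tensorflow as tf

# release the memory held by the previous Keras/TensorFlow graph;
# note that this also discards any model already built in this session,
# so the model has to be rebuilt afterwards
tf.keras.backend.clear_session()

# retry with a smaller batch size; halving it is a common first step
batch_size = 10
steps_per_epoch = x_train.shape[0] // batch_size
validation_steps = x_test.shape[0] // batch_size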

Try to estimate your memory requirements:

import numpy as np
from keras import backend as K

def get_model_memory_usage(batch_size, model):
    # memory needed for the output tensors of every layer
    shapes_mem_count = 0
    for l in model.layers:
        single_layer_mem = 1
        for s in l.output_shape:
            if s is None:
                continue
            single_layer_mem *= s
        shapes_mem_count += single_layer_mem

    # memory needed for the weights
    trainable_count = np.sum([K.count_params(p) for p in set(model.trainable_weights)])
    non_trainable_count = np.sum([K.count_params(p) for p in set(model.non_trainable_weights)])

    # bytes per number depend on the float precision Keras is using
    number_size = 4.0
    if K.floatx() == 'float16':
        number_size = 2.0
    if K.floatx() == 'float64':
        number_size = 8.0

    total_memory = number_size * (batch_size * shapes_mem_count + trainable_count + non_trainable_count)
    gbytes = np.round(total_memory / (1024.0 ** 3), 3)
    return gbytes
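
For example (hypothetical call, using the transfer_simple model and batch_size defined in the question):

# estimate how many gigabytes one training batch of the frozen-VGG16 model needs
estimated_gb = get_model_memory_usage(batch_size, transfer_simple)
print('estimated memory usage: %.3f GB' % estimated_gb)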