I am trying to train this keras-yolov3 model on my custom dataset (which has only a single class, 'person') using Google Colab. After the first 50 epochs, which use the full YOLO model with all but the output layers frozen, the Tesla T4 GPU runs out of memory. So instead I train the model for the first 50 epochs only, save the model weights to logs/000/trained_weights_stage_1.h5, and then reload the model with the new weights.
Here is the code from train.py that I use for training the first 50 epochs.
"""
Retrain the YOLO model for your own dataset.
"""
"""
I am removing the checkpointing, since it takes a lot of space and training
Yolov3 model doesn't take much time, just saving end weights
"""
import numpy as np
import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
def _main():
    annotation_path = 'model_data/annotations.txt'
    log_dir = 'logs/000/'
    classes_path = 'model_data/people_tracking_classes.txt'
    anchors_path = 'model_data/yolo_anchors.txt'
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    anchors = get_anchors(anchors_path)
    input_shape = (416,416) # multiple of 32, hw
    is_tiny_version = len(anchors)==6 # default setting
    if is_tiny_version:
        model = create_tiny_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/tiny_yolo.h5')
    else:
        model = create_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/yolo.h5') # make sure you know what you freeze
    logging = TensorBoard(log_dir=log_dir) #SEE THIS
    checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=3, verbose=0)
    #verbose=1 isn't that beneficial.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    np.random.seed(10101)
    np.random.shuffle(lines)
    print(lines)
    np.random.seed(None)
    num_val = int(len(lines)*val_split)
    num_train = len(lines) - num_val
    # Train with frozen layers first, to get a stable loss.
    # Adjust num epochs to your dataset. This step is enough to obtain a not bad model.
    if True:
        model.compile(optimizer=Adam(lr=1e-3), loss={
            # use custom yolo_loss Lambda layer.
            'yolo_loss': lambda y_true, y_pred: y_pred})
        batch_size = 32
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
                steps_per_epoch=max(1, num_train//batch_size),
                validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
                validation_steps=max(1, num_val//batch_size),
                epochs=50,
                initial_epoch=0,
                callbacks=[logging]) #checkpoint])
        model.save_weights(log_dir + 'trained_weights_stage_1.h5')
        model.save(log_dir + 'trained_model_stage_1.h5')
        json_string = model.to_json()
        print('Saved Model Architecture to: {}'.format(json_string))
    # Unfreeze and continue training, to fine-tune.
    # Train longer if the result is not good.
    """if True:
        for i in range(len(model.layers)):
            model.layers[i].trainable = True
        model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
        print('Unfreeze all of the layers.')
        batch_size = 32 # note that more GPU memory is required after unfreezing the body
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        #import pdb; pdb.set_trace()
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
            steps_per_epoch=max(1, num_train//batch_size),
            validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
            validation_steps=max(1, num_val//batch_size),
            epochs=100,
            initial_epoch=50,
            callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
        model.save_weights(log_dir + 'trained_weights_final.h5')"""
    # Further training if needed.

def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names

def get_anchors(anchors_path):
    '''loads the anchors from a file'''
    with open(anchors_path) as f:
        anchors = f.readline()
    anchors = [float(x) for x in anchors.split(',')]
    return np.array(anchors).reshape(-1, 2)

def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/yolo.h5'):
    '''create the training model'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)
    # I can follow the Python here, but not the logic behind y_true; it seems to
    # correspond to the 3 output layers of YOLOv3.
    y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \
        num_anchors//3, num_classes+5)) for l in range(3)]
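    # (For reference: y_true is a list of 3 placeholder Inputs, one per YOLOv3
    # detection scale with strides 32, 16 and 8. Each has shape
    # (grid_h, grid_w, num_anchors//3, 5 + num_classes), where the 5 values are
    # the box x, y, w, h and objectness; the real ground-truth tensors are
    # produced by preprocess_true_boxes() in data_generator() below.)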
    model_body = yolo_body(image_input, num_anchors//3, num_classes)
    print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
    if load_pretrained:
        """model.load_weights(filepath, by_name=False) loads the weights of
        the model from a HDF5 file (created by save_weights). By default, the
        architecture is expected to be unchanged. To load weights into a different
        architecture (with some layers in common), use by_name=True to load only
        those layers with the same name.
        """
        model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
        #skip_mismatch
        print(len(model_body.layers)) # to find out the number of layers; should be 185
        print('Load weights {}.'.format(weights_path))
        if freeze_body in [1, 2]:
            # Freeze darknet53 body or freeze all but 3 output layers.
            num = (185, len(model_body.layers)-3)[freeze_body-1]
            for i in range(num): model_body.layers[i].trainable = False
            print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)
    return model

def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/tiny_yolo.h5'):
    '''create the training model, for Tiny YOLOv3'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)
    y_true = [Input(shape=(h//{0:32, 1:16}[l], w//{0:32, 1:16}[l], \
        num_anchors//2, num_classes+5)) for l in range(2)]
    model_body = tiny_yolo_body(image_input, num_anchors//2, num_classes)
    print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
    if load_pretrained:
        model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
        print('Load weights {}.'.format(weights_path))
        if freeze_body in [1, 2]:
            # Freeze the darknet body or freeze all but 2 output layers.
            num = (20, len(model_body.layers)-2)[freeze_body-1]
            for i in range(num): model_body.layers[i].trainable = False
            print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)
    return model

def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes):
    '''data generator for fit_generator'''
    n = len(annotation_lines)
    i = 0
    while True:
        image_data = []
        box_data = []
        for b in range(batch_size):
            if i==0:
                np.random.shuffle(annotation_lines)
            image, box = get_random_data(annotation_lines[i], input_shape, random=True)
            image_data.append(image)
            box_data.append(box)
            i = (i+1) % n
        image_data = np.array(image_data)
        box_data = np.array(box_data)
        y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes)
        yield [image_data, *y_true], np.zeros(batch_size)

def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes):
    n = len(annotation_lines)
    if n==0 or batch_size<=0: return None
    return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes)

if __name__ == '__main__':
    _main()
# Explanation of the "skipping loading of weights" warning:
"""These layers are the three output layers of the YOLO network. You probably
got the message because you changed the number of classes, and therefore the
shape of the last conv layers changes as well. So you can just ignore the
message; the network will work just fine."""
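To make that warning concrete, here is a small illustration of my own (not part of train.py) of how the shape of the three output conv layers depends on the number of classes:
anchors_per_scale = 3                          # YOLOv3 uses 3 anchors per detection scale
coco_filters   = anchors_per_scale * (5 + 80)  # = 255, output conv filters in the pretrained yolo.h5
person_filters = anchors_per_scale * (5 + 1)   # = 18, output conv filters for my single 'person' class
# 255 != 18, so those three layers are skipped when loading with by_name=True, skip_mismatch=True.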
After finishing the first 50 epochs, as mentioned above, I load the trained weights into the model and try to fine-tune it by unfreezing all the layers, as shown below:
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import numpy as np
from keras.layers import Input, Lambda
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
import keras.losses
keras.losses.custom_loss = yolo_loss
log_dir = 'logs/000/'
annotation_path = 'model_data/annotations.txt'
val_split = 0.1
with open(annotation_path) as f:
    lines = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines)
print(lines)
np.random.seed(None)
num_val = int(len(lines)*val_split)
num_train = len(lines) - num_val
checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss', save_weights_only=True, save_best_only=True, period=10, verbose=0)
classes_path = 'model_data/new_classes.txt'
def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names
class_names = get_classes(classes_path)
num_classes = len(class_names)
from yolo3.model import yolo_head, yolo_body
from keras.layers import Input, Lambda
from keras.utils import CustomObjectScope
model = yolo_body(Input(shape=(None, None, 3)), 3, num_classes)
model.load_weights('logs/000/trained_weights_stage_1.h5', by_name=True)
# Unfreeze and continue training, to fine-tune.
# Train longer if the result is not good.
if True:
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
    print('Unfreeze all of the layers.')
    batch_size = 32 # note that more GPU memory is required after unfreezing the body
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    #import pdb; pdb.set_trace()
    history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=100,
        initial_epoch=50,
        callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
    model.save_weights(log_dir + 'trained_weights_final.h5')
# Further training if needed.
When I run the code above, I get the following ValueError:
ValueError Traceback (most recent call last)
<ipython-input-30-d6473c9875b2> in <module>()
61 for i in range(len(model.layers)):
62 model.layers[i].trainable = True
---> 63 model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
64 print('Unfreeze all of the layers.')
65
/usr/local/lib/python3.6/dist-packages/keras/engine/training.py in compile(self, optimizer, loss, metrics, loss_weights, sample_weight_mode, weighted_metrics, target_tensors, **kwargs)
117 'dictionary: "' + name + '". '
118 'Only expected the following keys: ' +
--> 119 str(self.output_names))
120 loss_functions = []
121 for name in self.output_names:
ValueError: Unknown entry in loss dictionary: "yolo_loss". Only expected the following keys: ['conv2d_209', 'conv2d_217', 'conv2d_225']
I don't know how to get rid of this error and train the full model with the weights saved in the first part. I also tried loading the full model saved by model.save('trained_model_stage_1.h5') with model.load_model('trained_model_stage_1.h5', custom_objects={'yolo_loss' : yolo_loss}), but that did not work for me either; it fails with yolo_head is not defined.
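For reference, the full-model reload I attempted looked roughly like the sketch below (my reconstruction of that attempt, assuming keras.models.load_model; it fails with the yolo_head error mentioned above):
from keras.models import load_model
from yolo3.model import yolo_loss
# Reload the full model saved at the end of stage 1; for me this fails with "yolo_head is not defined".
model = load_model('trained_model_stage_1.h5', custom_objects={'yolo_loss': yolo_loss})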