I am trying to train this keras-yolov3 model on my custom dataset (which has only a single class, 'person') using Google Colab. After the first 50 epochs, which use the full YOLO model with all but the output layers frozen, the Tesla T4 GPU runs out of memory. So instead I train the model for the first 50 epochs only, save the model weights to logs/000/trained_weights_stage_1.h5, and then reload the model with the new weights.
Here is the code from train.py that I use for training the first 50 epochs.
"""
Retrain the YOLO model for your own dataset.
"""
"""
I am removing the checkpointing, since it takes a lot of space and training
Yolov3 model doesn't take much time, just saving end weights
"""
import numpy as np
import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
def _main():
    annotation_path = 'model_data/annotations.txt'
    log_dir = 'logs/000/'
    classes_path = 'model_data/people_tracking_classes.txt'
    anchors_path = 'model_data/yolo_anchors.txt'
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    anchors = get_anchors(anchors_path)
    input_shape = (416,416) # multiple of 32, hw
    is_tiny_version = len(anchors)==6 # default setting
    if is_tiny_version:
        model = create_tiny_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/tiny_yolo.h5')
    else:
        model = create_model(input_shape, anchors, num_classes,
            freeze_body=2, weights_path='model_data/yolo.h5') # make sure you know what you freeze
    logging = TensorBoard(log_dir=log_dir) #SEE THIS
    checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=3, verbose=0)
    #verbose=1 isn't that beneficial.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    np.random.seed(10101)
    np.random.shuffle(lines)
    print(lines)
    np.random.seed(None)
    num_val = int(len(lines)*val_split)
    num_train = len(lines) - num_val
    # Train with frozen layers first, to get a stable loss.
    # Adjust num epochs to your dataset. This step is enough to obtain a not bad model.
    if True:
        model.compile(optimizer=Adam(lr=1e-3), loss={
            # use custom yolo_loss Lambda layer.
            'yolo_loss': lambda y_true, y_pred: y_pred})
        batch_size = 32
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
                steps_per_epoch=max(1, num_train//batch_size),
                validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
                validation_steps=max(1, num_val//batch_size),
                epochs=50,
                initial_epoch=0,
                callbacks=[logging]) #checkpoint])
        model.save_weights(log_dir + 'trained_weights_stage_1.h5')
        model.save(log_dir + 'trained_model_stage_1.h5')
        json_string = model.to_json()
        print('Saved Model Architecture to: {}'.format(json_string))
    # Unfreeze and continue training, to fine-tune.
    # Train longer if the result is not good.
    """if True:
        for i in range(len(model.layers)):
            model.layers[i].trainable = True
        model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
        print('Unfreeze all of the layers.')
        batch_size = 32 # note that more GPU memory is required after unfreezing the body
        print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
        #import pdb; pdb.set_trace()
        history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
            steps_per_epoch=max(1, num_train//batch_size),
            validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
            validation_steps=max(1, num_val//batch_size),
            epochs=100,
            initial_epoch=50,
            callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
        model.save_weights(log_dir + 'trained_weights_final.h5')"""
    # Further training if needed.

def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names

def get_anchors(anchors_path):
    '''loads the anchors from a file'''
    with open(anchors_path) as f:
        anchors = f.readline()
    anchors = [float(x) for x in anchors.split(',')]
    return np.array(anchors).reshape(-1, 2)

def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/yolo.h5'):
    '''create the training model'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)
    # I can follow the Python here, but not the logic behind y_true; it seems to
    # correspond to the 3 output layers of YOLOv3.
    y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \
        num_anchors//3, num_classes+5)) for l in range(3)]
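    # (For reference: y_true is a list of 3 placeholder Inputs, one per YOLOv3
    # detection scale with strides 32, 16 and 8. Each has shape
    # (grid_h, grid_w, num_anchors//3, 5 + num_classes), where the 5 values are
    # the box x, y, w, h and objectness; the real ground-truth tensors are
    # produced by preprocess_true_boxes() in data_generator() below.)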
    model_body = yolo_body(image_input, num_anchors//3, num_classes)
    print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
    if load_pretrained:
        """model.load_weights(filepath, by_name=False) loads the weights of
        the model from a HDF5 file (created by save_weights). By default, the
        architecture is expected to be unchanged. To load weights into a different
        architecture (with some layers in common), use by_name=True to load only
        those layers with the same name.
        """
        model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
        #skip_mismatch
        print(len(model_body.layers)) # to find out the number of layers; should be 185
        print('Load weights {}.'.format(weights_path))
        if freeze_body in [1, 2]:
            # Freeze darknet53 body or freeze all but 3 output layers.
            num = (185, len(model_body.layers)-3)[freeze_body-1]
            for i in range(num): model_body.layers[i].trainable = False
            print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)
    return model

def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2,
            weights_path='model_data/tiny_yolo.h5'):
    '''create the training model, for Tiny YOLOv3'''
    K.clear_session() # get a new session
    image_input = Input(shape=(None, None, 3))
    h, w = input_shape
    num_anchors = len(anchors)
    y_true = [Input(shape=(h//{0:32, 1:16}[l], w//{0:32, 1:16}[l], \
        num_anchors//2, num_classes+5)) for l in range(2)]
    model_body = tiny_yolo_body(image_input, num_anchors//2, num_classes)
    print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
    if load_pretrained:
        model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
        print('Load weights {}.'.format(weights_path))
        if freeze_body in [1, 2]:
            # Freeze the darknet body or freeze all but 2 output layers.
            num = (20, len(model_body.layers)-2)[freeze_body-1]
            for i in range(num): model_body.layers[i].trainable = False
            print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers)))
    model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})(
        [*model_body.output, *y_true])
    model = Model([model_body.input, *y_true], model_loss)
    return model

def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes):
    '''data generator for fit_generator'''
    n = len(annotation_lines)
    i = 0
    while True:
        image_data = []
        box_data = []
        for b in range(batch_size):
            if i==0:
                np.random.shuffle(annotation_lines)
            image, box = get_random_data(annotation_lines[i], input_shape, random=True)
            image_data.append(image)
            box_data.append(box)
            i = (i+1) % n
        image_data = np.array(image_data)
        box_data = np.array(box_data)
        y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes)
        yield [image_data, *y_true], np.zeros(batch_size)

def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes):
    n = len(annotation_lines)
    if n==0 or batch_size<=0: return None
    return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes)

if __name__ == '__main__':
    _main()
# Explanation of the "skipping loading of weights" warning:
"""These layers are the three output layers of the YOLO network. You probably
got the message because you changed the number of classes, and therefore the
shape of the last conv layers changes as well. So you can just ignore the
message; the network will work just fine."""
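To make that warning concrete, here is a small illustration of my own (not part of train.py) of how the shape of the three output conv layers depends on the number of classes:
anchors_per_scale = 3                          # YOLOv3 uses 3 anchors per detection scale
coco_filters   = anchors_per_scale * (5 + 80)  # = 255, output conv filters in the pretrained yolo.h5
person_filters = anchors_per_scale * (5 + 1)   # = 18, output conv filters for my single 'person' class
# 255 != 18, so those three layers are skipped when loading with by_name=True, skip_mismatch=True.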
After finishing the first 50 epochs, as mentioned above, I load the trained weights into the model and try to fine-tune it by unfreezing all the layers, as shown below:
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import numpy as np
from keras.layers import Input, Lambda
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
import keras.losses
keras.losses.custom_loss = yolo_loss
log_dir = 'logs/000/'
annotation_path = 'model_data/annotations.txt'
val_split = 0.1
with open(annotation_path) as f:
    lines = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines)
print(lines)
np.random.seed(None)
num_val = int(len(lines)*val_split)
num_train = len(lines) - num_val
checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss', save_weights_only=True, save_best_only=True, period=10, verbose=0)
classes_path = 'model_data/new_classes.txt'
def get_classes(classes_path):
    '''loads the classes'''
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names
class_names = get_classes(classes_path)
num_classes = len(class_names)
from yolo3.model import yolo_head, yolo_body
from keras.layers import Input, Lambda
from keras.utils import CustomObjectScope
model = yolo_body(Input(shape=(None, None, 3)), 3, num_classes)
model.load_weights('logs/000/trained_weights_stage_1.h5', by_name=True)
# Unfreeze and continue training, to fine-tune.
# Train longer if the result is not good.
if True:
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
    print('Unfreeze all of the layers.')
    batch_size = 32 # note that more GPU memory is required after unfreezing the body
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    #import pdb; pdb.set_trace()
    history = model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=100,
        initial_epoch=50,
        callbacks=[logging, reduce_lr, early_stopping]) #removed checkpointing.
    model.save_weights(log_dir + 'trained_weights_final.h5')
# Further training if needed.
When I run the code above, I get the following ValueError:
ValueError Traceback (most recent call last)
<ipython-input-30-d6473c9875b2> in <module>()
61 for i in range(len(model.layers)):
62 model.layers[i].trainable = True
---> 63 model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change
64 print('Unfreeze all of the layers.')
65
/usr/local/lib/python3.6/dist-packages/keras/engine/training.py in compile(self, optimizer, loss, metrics, loss_weights, sample_weight_mode, weighted_metrics, target_tensors, **kwargs)
117 'dictionary: "' + name + '". '
118 'Only expected the following keys: ' +
--> 119 str(self.output_names))
120 loss_functions = []
121 for name in self.output_names:
ValueError: Unknown entry in loss dictionary: "yolo_loss". Only expected the following keys: ['conv2d_209', 'conv2d_217', 'conv2d_225']
I don't know how to get rid of this error and train the full model with the weights saved in the first part. I also tried loading the full model saved by model.save('trained_model_stage_1.h5') with model.load_model('trained_model_stage_1.h5', custom_objects={'yolo_loss' : yolo_loss}), but that did not work for me either; it fails with yolo_head is not defined.
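For reference, the full-model reload I attempted looked roughly like the sketch below (my reconstruction of that attempt, assuming keras.models.load_model; it fails with the yolo_head error mentioned above):
from keras.models import load_model
from yolo3.model import yolo_loss
# Reload the full model saved at the end of stage 1; for me this fails with "yolo_head is not defined".
model = load_model('trained_model_stage_1.h5', custom_objects={'yolo_loss': yolo_loss})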