Multiple branches and multiple GPUs cannot save a checkpoint: "name already exists"

Time: 2019-03-28 02:31:00

Tags: tensorflow keras

#This has been bothering me for a long time.
#My problem is that when I use Keras, I cannot save a checkpoint.

Epoch 00001: saving model to ./models/triplet_models_continue/InceptionV3_Triplet_epoch=0001-loss=7.2682-modelAcc=0.0000-colorAcc=1.0000-val_loss=25.1771-val_modelAcc=0.0000-val_colorAcc=0.5000.h5
Traceback (most recent call last):
  File "Method_3_train_triplet_ConvNets_Model_Color.py", line 190, in <module>
    max_queue_size = 10, workers = 1, use_multiprocessing=True)
  File "/root/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py",
     

第91行,在包装器中           return func(* args,** kwargs)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/engine/training.py”,   在fit_generator中的第1418行           initial_epoch = initial_epoch)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/engine/training_generator.py”,   在fit_generator中的第251行           callbacks.on_epoch_end(epoch,epoch_logs)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/callbacks.py”,行   79,在on_epoch_end           callback.on_epoch_end(epoch,logs)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/callbacks.py”,行   457,在on_epoch_end           self.model.save(文件路径,覆盖=真)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/engine/network.py”,   1090行,保存           save_model(自身,文件路径,覆盖,include_optimizer)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/engine/saving.py”,   第382行,在save_model中           _serialize_model(model,f,include_optimizer)         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/engine/saving.py”,   _serialize_model中的第114行           layer_group [名称] = val         文件“ /root/anaconda3/lib/python3.6/site-packages/keras/utils/io_utils.py”,   第216行,位于 setitem           '名称为“ {}”的组存在。'。format(attr))       KeyError:'无法设置属性。存在名为“ b \'conv2d_1 / kernel:0 \'”的组。'
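The KeyError is raised while Keras writes the weights into the HDF5 checkpoint: every weight name must be unique inside the group it is written to, and here 'conv2d_1/kernel:0' is apparently being written a second time. The minimal sketch below (my assumption, using h5py directly rather than Keras, with a made-up file name) only illustrates that storage-level constraint, not my actual model:

import h5py
import numpy as np

# Sketch of the uniqueness rule behind the error message: writing the same
# name twice into one HDF5 group is rejected.
with h5py.File('name_clash_demo.h5', 'w') as f:
    layer_group = f.create_group('model_1')
    layer_group['conv2d_1/kernel:0'] = np.zeros((3, 3, 3, 32))  # first write succeeds
    layer_group['conv2d_1/kernel:0'] = np.zeros((3, 3, 3, 32))  # same name again -> "already exists" error
    # (the exact exception type here depends on the h5py version; Keras does its
    # own duplicate check in io_utils.__setitem__, which is where the KeyError
    # in the traceback above comes from)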

#Here is my script:
import os
GPUS = "0,1,2,3"
os.environ["CUDA_VISIBLE_DEVICES"] = GPUS
from math import ceil
import numpy as np
import copy
import keras.backend as K
from keras.applications.inception_v3 import InceptionV3
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers import Dense, Input, concatenate, subtract, dot, Activation, add, merge, Lambda
from keras.models import Model
from keras.models import load_model
from keras.optimizers import SGD, RMSprop
from sklearn.utils import class_weight
from utils import generator_batch_triplet, generator_batch
from keras.utils import multi_gpu_model
from loss import triplet_loss, identity_loss, MARGIN
import pickle
np.random.seed(1024)

FINE_TUNE = False
SAVE_FILTERED_LIST = False
FINE_TUNE_ON_ATTRIBUTES = False
LEARNING_RATE = 0.00001
NBR_EPOCHS = 10
BATCH_SIZE = 4
IMG_WIDTH = 299
IMG_HEIGHT = 299
monitor_index = 'loss'
NBR_MODELS = 250
NBR_COLORS = 7
RANDOM_SCALE = True
nbr_gpus = len(GPUS.split(','))
INITIAL_EPOCH = 0



if __name__ == "__main__":

    if FINE_TUNE:
        model_path = './models/triplet_models_continue/InceptionV3_Triplet_epoch=0001-loss=7.2470-modelAcc=0.0000-colorAcc=1.0000-val_loss=25.1771-val_modelAcc=0.0000-val_colorAcc=0.5000.h5'
        print('Finetune and Loading {} ...'.format(model_path))
        model = load_model(model_path,
                           custom_objects={'identity_loss': identity_loss,
                                           'triplet_loss': triplet_loss,
                                           'MARGIN': MARGIN})
        INITIAL_EPOCH = 2

    elif FINE_TUNE_ON_ATTRIBUTES:
        print('Finetune on the attributes model ...')
        # Begin with Attributes pretrained weights.
        attributes_branch = load_model('./models/attributes_models/InceptionV3_vehicleModelColor_facs=1024_epoch=0004-loss=0.8892-modelAcc=0.8590-colorAcc=0.8990-val_loss=0.7493-val_modelAcc=0.8858-val_colorAcc=0.8890.h5')
        #attributes_branch.summary()  # view the model architecture
        attributes_branch.get_layer(name = 'global_average_pooling2d_1').name = 'f_base'
        f_base = attributes_branch.get_layer(name = 'f_base').output  # 1024-D

        anchor = attributes_branch.input
        positive = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3), name='positive')
        negative = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3), name='negative')

        # Attributes Branch
        f_acs = attributes_branch.get_layer(name = 'f_acs').output
        f_model = attributes_branch.get_layer(name = 'predictions_model').output
        f_color = attributes_branch.get_layer(name = 'predictions_color').output

        # Similarity Learning Branch
        f_sls1 = Dense(1024, name = 'sls1')(f_base)
        f_sls2 = concatenate([f_sls1, f_acs], axis = -1, name = 'sls1_concatenate')  # 1024-D
        # The author said that ReLU is applied only to layer ``SLS_2'', since non-linearity
        # would disrupt the embedding learned in layer ``SLS_1''.
        #f_sls2 = Activation('relu', name = 'sls1_concatenate_relu')(f_sls2)
        f_sls2 = Dense(1024, name = 'sls2')(f_sls2)
        f_sls2 = Activation('relu', name = 'sls2_relu')(f_sls2)
        # Non-linearity ?
        f_sls3 = Dense(256, name = 'sls3')(f_sls2)
        sls_branch = Model(inputs = attributes_branch.input, outputs = f_sls3)
        for layer in sls_branch.layers:
            layer.name=layer.name+"m1"
        f_sls3_anchor = sls_branch(anchor)
        f_sls3_positive = sls_branch(positive)
        f_sls3_negative = sls_branch(negative)

        loss = Lambda(triplet_loss,
                  output_shape=(1, ))([f_sls3_anchor, f_sls3_positive, f_sls3_negative])

        model = Model(inputs = [anchor, positive, negative], outputs = [f_model, f_color, loss])
        for layer in model.layers:
            layer.name=layer.name+"m1"
        sls_branch.summary()
        model.summary()
    else:
        # Begin with Imagenet pretrained weights.
        print('Loading InceptionV3 Weights from ImageNet Pretrained ...')
        inception = InceptionV3(include_top=False, weights= None,
               input_tensor=None, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3), pooling = 'avg')
        f_base = inception.get_layer(index = -1).output     # shape=(None, 1, 1, 2048)
        #inception.summary()  # view the model architecture
        anchor = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3), name='anchor')
        positive = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3), name='positive')
        negative = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3), name='negative')
        # Attributes-Model-Color Branch
        f_acs = Dense(1024, name='f_acs')(f_base)
        attributes_branch = Model(inputs= inception.input, outputs = f_acs)
        f_anchor_acs = attributes_branch(anchor)
        f_model = Dense(NBR_MODELS, activation='softmax', name='predictions_model')(f_anchor_acs)
        f_color = Dense(NBR_COLORS, activation='softmax', name='predictions_color')(f_anchor_acs)

        # Similarity Learning Branch
        f_sls1 = Dense(1024, name = 'sls1')(f_base)
        f_sls2_c = concatenate([f_sls1, f_acs], axis = -1)
        # Non-linearity ?
        f_sls2 = Dense(1024, name = 'sls2')(f_sls2_c)
        # Non-linearity ?
        f_sls3 = Dense(256, name = 'sls3')(f_sls2)
        sls_branch = Model(inputs = inception.input, outputs = f_sls3)
        for layer in sls_branch.layers:
            layer.name=layer.name+"m1"
        f_sls3_anchor = sls_branch(anchor)
        f_sls3_positive = sls_branch(positive)
        f_sls3_negative = sls_branch(negative)

        loss = Lambda(triplet_loss,
                  output_shape=(1, ))([f_sls3_anchor, f_sls3_positive, f_sls3_negative])

        model = Model(inputs = [anchor, positive, negative], outputs = [f_model, f_color, loss])
        for layer in model.layers:
            layer.name=layer.name+"m2"
        sls_branch.summary()
        model.summary()
    print('Training model begins...')

    optimizer = SGD(lr = LEARNING_RATE, momentum = 0.9, decay = 0.0, nesterov = True)
    #optimizer = RMSprop(lr = LEARNING_RATE)

    if nbr_gpus > 1:
        print('Using {} GPUS.\n'.format(nbr_gpus))
        model = multi_gpu_model(model, gpus = nbr_gpus)
        BATCH_SIZE *= nbr_gpus
    else:
        print('Using a single GPU.\n')
    model.compile(loss=["categorical_crossentropy", "categorical_crossentropy", identity_loss],
                  optimizer=optimizer, metrics=["accuracy"])

    #model.summary()

    model_file_saved = "./models/triplet_models_continue/InceptionV3_Triplet_epoch={epoch:04d}-loss={loss:.4f}-modelAcc={predictions_model_acc:.4f}-colorAcc={predictions_color_acc:.4f}-val_loss={val_loss:.4f}-val_modelAcc={val_predictions_model_acc:.4f}-val_colorAcc={val_predictions_color_acc:.4f}.h5"
    # Define several callbacks

    checkpoint = ModelCheckpoint(model_file_saved, verbose = 1)

    reduce_lr = ReduceLROnPlateau(monitor='val_'+monitor_index, factor=0.5,
                  patience=5, verbose=1, min_lr=0.00001)

    early_stop = EarlyStopping(monitor='val_'+monitor_index, patience=15, verbose=1)

    train_f = open("triple.pkl","rb")
    train_data = pickle.load(train_f)
    train_f.close()

    dic_f = open("dic.pkl","rb")
    dic_data = pickle.load(dic_f)
    dic_f.close()

    test_f = open("test.pkl","rb")
    test_data = pickle.load(test_f)
    test_f.close()


    train_data_lines = train_data[0:4]
    dic_train_data_lines = dic_data
    nbr_train = len(train_data_lines)
    print('# Train Images: {}.'.format(nbr_train))
    steps_per_epoch = int(ceil(nbr_train * 1. / BATCH_SIZE))


    val_data_lines = test_data[0:4]
    nbr_val = len(val_data_lines)
    print('# Val Images: {}.'.format(nbr_val))
    validation_steps = int(ceil(nbr_val * 1. / BATCH_SIZE))

    model.fit_generator(generator_batch_triplet(train_data_lines, dic_train_data_lines,
                        mode = 'train', nbr_class_one = NBR_MODELS, nbr_class_two = NBR_COLORS,
                        batch_size = BATCH_SIZE, img_width = IMG_WIDTH,
                        img_height = IMG_HEIGHT, random_scale = RANDOM_SCALE,
                        shuffle = True, augment = True),
                        steps_per_epoch = steps_per_epoch, epochs = NBR_EPOCHS, verbose = 1,
                        validation_data = generator_batch_triplet(val_data_lines, { },
                        mode = 'val', nbr_class_one = NBR_MODELS, nbr_class_two = NBR_COLORS,
                        batch_size = BATCH_SIZE, img_width = IMG_WIDTH, img_height = IMG_HEIGHT,
                        shuffle = False, augment = False),
                        validation_steps = validation_steps,
                        callbacks = [checkpoint], initial_epoch = INITIAL_EPOCH,
                        max_queue_size = 10, workers = 1, use_multiprocessing=True)
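Not a confirmed fix, but a pattern that is often suggested for checkpointing multi_gpu_model setups is to keep a reference to the single-GPU template model and save only its weights from a small custom callback, instead of calling save() on the parallel model. A rough sketch follows (TemplateCheckpoint and template_model are hypothetical names, not part of my script above):

from keras.callbacks import Callback

class TemplateCheckpoint(Callback):
    """Save the weights of the single-GPU template model after each epoch."""
    def __init__(self, template_model, filepath):
        super(TemplateCheckpoint, self).__init__()
        self.template_model = template_model  # the model built before multi_gpu_model()
        self.filepath = filepath              # may contain {epoch:...}/{loss:...} placeholders

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        path = self.filepath.format(epoch=epoch + 1, **logs)
        # save_weights() skips serializing the wrapper graph that multi_gpu_model builds
        self.template_model.save_weights(path)

# Hypothetical usage: keep `template_model = model` before the multi_gpu_model()
# call above, then pass TemplateCheckpoint(template_model, model_file_saved)
# in the callbacks list instead of (or next to) ModelCheckpoint.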

0 Answers:

No answers yet.