重新加载模型并重新训练它,不能再做“model.fit”

时间:2021-06-09 17:16:50

标签: python tensorflow machine-learning deep-learning image-segmentation

def jacard_coef(y_true, y_pred):
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (intersection + 1.0) / (K.sum(y_true_f) + K.sum(y_pred_f) - intersection + 1.0)

def jacard_coef_loss(y_true, y_pred):
    return -jacard_coef(y_true, y_pred)

if __name__ == "__main__":
    """ Seeding : make result reproducible"""
    np.random.seed(42)
    tf.random.set_seed(42)

    """ Folder for saving data """
    create_dir("Jaccard_Unet")

    """ Hyperparameters """
    batch_size = 4
    lr = 1e-4             #learning rate(0.0001)
    num_epoch = 10        #no. of epochs
    model_path = "Jaccard_Unet/Jaccard_Unet_v1.h5"
    csv_path = "Jaccard_Unet/Jaccard_Unet.csv"

    """ Dataset : 60 train/20 validation/20 testing"""
    dataset_path = '/content/drive/MyDrive/ISIC2018'
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data(dataset_path)

    print(f"Train: {len(train_x)} - {len(train_y)}")
    print(f"Valid: {len(valid_x)} - {len(valid_y)}")
    print(f"Test: {len(test_x)} - {len(test_y)}")

    train_dataset = tf_dataset(train_x, train_y, batch_size)
    valid_dataset = tf_dataset(valid_x, valid_y, batch_size)
    test_dataset = tf_dataset(test_x, test_y, batch_size)
    train_steps = len(train_x)//batch_size
    valid_steps = len(valid_x)//batch_size
    test_steps = len(test_x)//batch_size

    #do not forget about the reamainder
    if len(train_x) % batch_size != 0:
        train_steps += 1

    if len(valid_x) % batch_size != 0:
        valid_steps += 1

    """ Build Model """
    model = build_unet((H, W, 3))
    model.compile(loss=[jaccard_coef_loss], optimizer=Adam(lr), metrics=[jaccard_coef])
    callbacks = [
        ModelCheckpoint(model_path, verbose=1, save_best_only=True),  #save the weight
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-7, verbose=1),
        CSVLogger(csv_path),
        TensorBoard(),
        EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=False)
    ]
    #model.summary()

    history_jacard = model.fit(
                    train_dataset,
                    epochs=num_epoch,
                    validation_data=valid_dataset,
                    steps_per_epoch=train_steps,
                    validation_steps=valid_steps,
                    callbacks=callbacks
                )

所以,上面的代码是我训练过程的主要功能,10个epochs后,我想重新加载模型并再次训练它,因为我得到的jaccard分数并不理想。 但是当我用下面的代码重新加载它时:

with CustomObjectScope({'jaccard_coef_loss': jaccard_coef_loss, 'jaccard_coef': jaccard_coef}):
    new_model = tf.keras.models.load_model("Jaccard_Unet/Jaccard_Unet_v1.h5")

new_history_jcard = new_model.fit(
                    train_dataset,
                    epochs=num_epoch,
                    validation_data=valid_dataset,
                    steps_per_epoch=train_steps,
                    validation_steps=valid_steps,
                    callbacks=callbacks
                )

一些我无法理解的错误提示:

Epoch 1/10
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-190-605387750920> in <module>()
      5                     steps_per_epoch=train_steps,
      6                     validation_steps=valid_steps,
----> 7                     callbacks=callbacks
      8                 )

9 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    984           except Exception as e:  # pylint:disable=broad-except
    985             if hasattr(e, "ag_error_metadata"):
--> 986               raise e.ag_error_metadata.to_exception(e)
    987             else:
    988               raise

TypeError: in user code:

    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:855 train_function  *
        return step_function(self, iterator)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:840 run_step  **
        with ops.control_dependencies(_minimum_control_deps(outputs)):
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:2888 _minimum_control_deps
        outputs = nest.flatten(outputs, expand_composites=True)
    /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py:416 flatten
        return _pywrap_utils.Flatten(structure, expand_composites)

    TypeError: '<' not supported between instances of 'function' and 'str'

谁能帮帮我,我是深度学习的新手。

0 个答案:

没有答案