Automatic hyperparameter tuning in AWS SageMaker fails to run

Asked: 2020-07-06 11:11:13

Tags: amazon-web-services amazon-s3 amazon-ec2 deep-learning amazon-sagemaker

I am training my model using SageMaker. To get better results, I tried running automatic hyperparameter tuning. Training without tuning runs fine and gives the desired results, but as soon as I run it through the tuner, the training jobs fail with errors similar to the one below (only num_filters and learning_rate change from job to job):

AlgorithmError: ExecuteUserScriptError: Command "/usr/bin/python3 script_unet.py --batch_size 54 --learning_rate 0.0002596573898074083
--model_dir s3://sagemaker-us-east-2-6713267672/tensorflow-training-2020-07-04-10-02-56-198/model/tensorflow-training-200704-1002-002-b7291d39/model --num_filters 46"

I have already tried many other batch sizes to make sure it is not a memory problem, and it always produces the same error, so I don't think that is the cause. I need the model as an .h5 file so I can use it outside SageMaker, which is why the script saves it and uploads it to a bucket named models-pfe.
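For reference, the full Python traceback behind the truncated AlgorithmError above can be read from CloudWatch Logs. A minimal sketch, assuming the standard /aws/sagemaker/TrainingJobs log group (the job name is taken from the failed job in the error above):

import boto3

logs = boto3.client('logs')
group = '/aws/sagemaker/TrainingJobs'  # standard log group for SageMaker training jobs

def print_training_log(job_name):
    # list the log streams written by this training job
    streams = logs.describe_log_streams(logGroupName=group,
                                        logStreamNamePrefix=job_name)['logStreams']
    for stream in streams:
        # the most recent page of events is usually enough to see the traceback
        events = logs.get_log_events(logGroupName=group,
                                     logStreamName=stream['logStreamName'])
        for event in events['events']:
            print(event['message'])

print_training_log('tensorflow-training-200704-1002-002-b7291d39')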

The model script I am using is the following:

#Dependencies:
import argparse, os
import numpy as np

import tensorflow as tf
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
import boto3
from botocore.exceptions import NoCredentialsError

print("All the dependencies imported")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--num_filters', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--model_dir', type=str, default='s3://model-pfe')
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--testing', type=str, default=os.environ['SM_CHANNEL_TESTING'])
    parser.add_argument('--access_key', type=str)
    parser.add_argument('--secret_key', type=str)



    args, _ = parser.parse_known_args()

    epochs       = args.epochs
    num_filters  = args.num_filters
    lr           = args.learning_rate
    batch_size   = args.batch_size
    model_dir    = args.model_dir
    training_dir = args.training
    testing_dir  = args.testing
    access_key  = args.access_key
    secret_key  = args.secret_key



    # Load the training and testing arrays (each .npz is read once)
    train_data = np.load(os.path.join(training_dir, 'training.npz'))
    X_train, Y_train = train_data['image'], train_data['label']
    test_data = np.load(os.path.join(testing_dir, 'testing.npz'))
    X_test, Y_test = test_data['image'], test_data['label']

    # input image dimensions
    img_rows, img_cols = 512, 512

    # Tensorflow needs image channels last, e.g. (batch size, width, height, channels)
    K.set_image_data_format('channels_last')
    print(K.image_data_format())



    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Normalize pixel values
    X_train   = X_train.astype('float32')
    X_test    = X_test.astype('float32')
    X_train  /= 255
    X_test   /= 255


    # U-Net model
    inputs = Input((512, 512, 3))
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (inputs)
    c1 = Dropout(0.1) (c1)
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c1)
    p1 = MaxPooling2D((2, 2)) (c1)

    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p1)
    c2 = Dropout(0.1) (c2)
    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c2)
    p2 = MaxPooling2D((2, 2)) (c2)

    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p2)
    c3 = Dropout(0.2) (c3)
    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c3)
    p3 = MaxPooling2D((2, 2)) (c3)

    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p3)
    c4 = Dropout(0.2) (c4)
    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c4)
    p4 = MaxPooling2D(pool_size=(2, 2)) (c4)

    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p4)
    c5 = Dropout(0.3) (c5)
    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c5)

    u6 = Conv2DTranspose(num_filters*8, (2, 2), strides=(2, 2), padding='same') (c5)
    u6 = concatenate([u6, c4])
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u6)
    c6 = Dropout(0.2) (c6)
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c6)

    u7 = Conv2DTranspose(num_filters*4, (2, 2), strides=(2, 2), padding='same') (c6)
    u7 = concatenate([u7, c3])
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u7)
    c7 = Dropout(0.2) (c7)
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c7)

    u8 = Conv2DTranspose(num_filters*2, (2, 2), strides=(2, 2), padding='same') (c7)
    u8 = concatenate([u8, c2])
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u8)
    c8 = Dropout(0.1) (c8)
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c8)

    u9 = Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same') (c8)
    u9 = concatenate([u9, c1], axis=3)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u9)
    c9 = Dropout(0.1) (c9)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c9)

    outputs = Conv2D(1, (1, 1), activation='sigmoid') (c9)
    model = Model(inputs=[inputs], outputs=[outputs])
    print(model.summary())

    # Use GPUs (for ml.p2.8xlarge = 8 GPUs); gpus here must match the instance's GPU count
    model = multi_gpu_model(model, gpus=8)

    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy'])
    
    #Fit model
    results = model.fit(X_train, Y_train,
                        validation_data=(X_test, Y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        shuffle=True)

    
    # Validation evaluation
    score = model.evaluate(X_test, Y_test)
    print('Validation loss    :', score[0])
    print('Validation accuracy:', score[1])

    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
    
    def upload_to_aws(local_file, bucket, s3_file):
        try:
            s3.upload_file(local_file, bucket, s3_file)
            print("Upload Successful")
            return True
        except FileNotFoundError:
            print("The file was not found")
            return False
        except NoCredentialsError:
            print("Credentials not available")
            return False
    model.save('model.h5')
    upload_to_aws('model.h5', 'models-pfe', 'model.h5')

The automatic hyperparameter tuning is launched with the following script:

import sagemaker
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# My data locations in S3
training_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/training/training.npz"
validation_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/validation/testing.npz"


from sagemaker.tensorflow import TensorFlow

tf_estimator = TensorFlow(entry_point='script_unet.py', 
                          role=role,
                          train_instance_count=1, 
                          train_instance_type='ml.p2.8xlarge',
                          framework_version='1.12', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 60,
                              'batch_size': 32, 
                              'access_key'   : '',
                              'secret_key'   : ''}
                         )
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

hyperparameter_ranges = {
    'num_filters'  : IntegerParameter(32,64),
    'learning_rate': ContinuousParameter(0.0001, 0.005)}

objective_metric_name = 'loss'
objective_type = 'Minimize'
metric_definitions = [{'Name': 'loss','Regex': 'loss = ([0-9\\.]+)'}]

tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=6,
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')

tuner.fit({'training': training_input_path, 'validation': validation_input_path})
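One thing worth double-checking (an observation, not a confirmed cause of the failure): the metric Regex has to match what the training script actually prints. Keras progress output writes lines like loss: 0.2596, while the regex above expects loss = 0.2596, so the tuner would never capture the objective metric. A quick local sanity check:

import re

# illustrative Keras-style progress line
sample = '54/54 [==============================] - 10s - loss: 0.2596 - acc: 0.9100'
pattern = r'loss = ([0-9\.]+)'       # regex from metric_definitions above
print(re.findall(pattern, sample))   # [] -> the objective metric is never found
print(re.findall(r'loss: ([0-9\.]+)', sample))  # ['0.2596']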

I have changed the bucket names, secret key and access key for security reasons.
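As an aside on the keys: inside a SageMaker training container, boto3 can normally obtain credentials from the job's execution role, so passing access/secret keys as hyperparameters may be avoidable. A sketch, assuming the execution role has s3:PutObject on the target bucket:

import boto3

s3 = boto3.client('s3')  # no explicit keys; uses the role attached to the training job
s3.upload_file('model.h5', 'models-pfe', 'model.h5')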

0 Answers