I am training my model on SageMaker. To get better results, I tried running automatic hyperparameter tuning. Training without tuning runs fine and gives the results I want, but as soon as I run it through the tuner, every training job fails with an error similar to the one below (only num_filters and learning_rate change between jobs):
AlgorithmError: ExecuteUserScriptError: Command "/usr/bin/python3 script_unet.py --batch_size 54 --learning_rate 0.0002596573898074083
--model_dir s3://sagemaker-us-east-2-6713267672/tensorflow-training-2020-07-04-10-02-56-198/model/tensorflow-training-200704-1002-002-b7291d39/model --num_filters 46"
I have tried several other batch sizes to rule out a memory problem, and it always produces the same error, so I don't think that is the cause. I need the model saved with the .h5 extension so I can use it outside SageMaker, which is why the script ends with a save and an upload to a bucket named models-pfe.
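In case it helps, this is roughly how I pull the full Python traceback for one of the failed jobs out of CloudWatch (a minimal sketch; /aws/sagemaker/TrainingJobs is the standard SageMaker log group, and the job name is taken from the model path in the error above):

import boto3

logs = boto3.client('logs')
job_name = 'tensorflow-training-200704-1002-002-b7291d39'  # failed training job from the error above

# Each training job writes its stdout/stderr to log streams prefixed with the job name
streams = logs.describe_log_streams(logGroupName='/aws/sagemaker/TrainingJobs',
                                    logStreamNamePrefix=job_name)['logStreams']
for stream in streams:
    events = logs.get_log_events(logGroupName='/aws/sagemaker/TrainingJobs',
                                 logStreamName=stream['logStreamName'])['events']
    for event in events:
        print(event['message'])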
The training script I am using is the following:
#Dependencies:
import argparse, os
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
import boto3
from botocore.exceptions import NoCredentialsError
print("All the dependencies imported")
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # Hyperparameters passed by SageMaker as command-line arguments
    parser.add_argument('--epochs', type=int, default=60)
    parser.add_argument('--num_filters', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--model_dir', type=str, default='s3://model-pfe')

    # Input data channels provided by SageMaker
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--testing', type=str, default=os.environ['SM_CHANNEL_TESTING'])

    parser.add_argument('--access_key', type=str)
    parser.add_argument('--secret_key', type=str)

    args, _ = parser.parse_known_args()

    epochs = args.epochs
    num_filters = args.num_filters
    lr = args.learning_rate
    batch_size = args.batch_size
    model_dir = args.model_dir
    training_dir = args.training
    testing_dir = args.testing
    access_key = args.access_key
    secret_key = args.secret_key

    # Load the training and testing arrays from the .npz files in the data channels
    X_train = np.load(os.path.join(training_dir, 'training.npz'))['image']
    Y_train = np.load(os.path.join(training_dir, 'training.npz'))['label']
    X_test = np.load(os.path.join(testing_dir, 'testing.npz'))['image']
    Y_test = np.load(os.path.join(testing_dir, 'testing.npz'))['label']
    # input image dimensions
    img_rows, img_cols = 512, 512

    # Tensorflow needs image channels last, e.g. (batch size, width, height, channels)
    K.set_image_data_format('channels_last')
    print(K.image_data_format())

    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Normalize pixel values
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    # U-Net model
    inputs = Input((512, 512, 3))
    # Encoder (contracting path)
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (inputs)
    c1 = Dropout(0.1) (c1)
    c1 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c1)
    p1 = MaxPooling2D((2, 2)) (c1)

    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p1)
    c2 = Dropout(0.1) (c2)
    c2 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c2)
    p2 = MaxPooling2D((2, 2)) (c2)

    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p2)
    c3 = Dropout(0.2) (c3)
    c3 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c3)
    p3 = MaxPooling2D((2, 2)) (c3)

    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p3)
    c4 = Dropout(0.2) (c4)
    c4 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c4)
    p4 = MaxPooling2D(pool_size=(2, 2)) (c4)

    # Bottleneck
    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (p4)
    c5 = Dropout(0.3) (c5)
    c5 = Conv2D(num_filters*16, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c5)

    # Decoder (expansive path)
    u6 = Conv2DTranspose(num_filters*8, (2, 2), strides=(2, 2), padding='same') (c5)
    u6 = concatenate([u6, c4])
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u6)
    c6 = Dropout(0.2) (c6)
    c6 = Conv2D(num_filters*8, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c6)

    u7 = Conv2DTranspose(num_filters*4, (2, 2), strides=(2, 2), padding='same') (c6)
    u7 = concatenate([u7, c3])
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u7)
    c7 = Dropout(0.2) (c7)
    c7 = Conv2D(num_filters*4, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c7)

    u8 = Conv2DTranspose(num_filters*2, (2, 2), strides=(2, 2), padding='same') (c7)
    u8 = concatenate([u8, c2])
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u8)
    c8 = Dropout(0.1) (c8)
    c8 = Conv2D(num_filters*2, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c8)

    u9 = Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same') (c8)
    u9 = concatenate([u9, c1], axis=3)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (u9)
    c9 = Dropout(0.1) (c9)
    c9 = Conv2D(num_filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same') (c9)

    outputs = Conv2D(1, (1, 1), activation='sigmoid') (c9)

    model = Model(inputs=[inputs], outputs=[outputs])
    print(model.summary())

    # Use GPUs (for ml.p2.8xlarge = 8 GPUs)
    model = multi_gpu_model(model, gpus=8)
    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy'])
    # Fit model
    results = model.fit(X_train, Y_train,
                        validation_data=(X_test, Y_test),
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        shuffle=True)

    # Validation evaluation
    score = model.evaluate(X_test, Y_test)
    print('Validation loss :', score[0])
    print('Validation accuracy:', score[1])
    # Upload the saved .h5 model to my own bucket so I can use it outside SageMaker
    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)

    def upload_to_aws(local_file, bucket, s3_file):
        try:
            s3.upload_file(local_file, bucket, s3_file)
            print("Upload Successful")
            return True
        except FileNotFoundError:
            print("The file was not found")
            return False
        except NoCredentialsError:
            print("Credentials not available")
            return False

    model.save('model.h5')
    upload_to_aws('model.h5', 'models-pfe', "model.h5")
and I run this script through automatic hyperparameter tuning with the following notebook code:
import sagemaker
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
#My data location in s3
training_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/training/training.npz"
validation_input_path="s3://sagemaker-us-east-2-6713267672/pfe-unet/validation/testing.npz"
from sagemaker.tensorflow import TensorFlow
tf_estimator = TensorFlow(entry_point='script_unet.py',
                          role=role,
                          train_instance_count=1,
                          train_instance_type='ml.p2.8xlarge',
                          framework_version='1.12',
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 60,
                              'batch_size': 32,
                              'access_key': '',
                              'secret_key': ''}
                          )
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {
    'num_filters': IntegerParameter(32, 64),
    'learning_rate': ContinuousParameter(0.0001, 0.005)}

objective_metric_name = 'loss'
objective_type = 'Minimize'
metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

tuner = HyperparameterTuner(tf_estimator,
                            objective_metric_name,
                            hyperparameter_ranges,
                            metric_definitions,
                            max_jobs=6,
                            max_parallel_jobs=1,
                            objective_type=objective_type,
                            early_stopping_type='Auto')
tuner.fit({'training': training_input_path, 'validation': validation_input_path})
I have changed the bucket names, the secret key, and the access key for security reasons.
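For reference, this is roughly how I list the individual training jobs the tuner launched and the failure reason SageMaker reports for each (a minimal sketch, run after tuner.fit(); reading the tuning job name from tuner.latest_tuning_job.name assumes the v1 SDK used above, otherwise the name can be copied from the console):

import boto3

sm = boto3.client('sagemaker')
tuning_job_name = tuner.latest_tuning_job.name  # assumption: v1 SDK attribute; or copy the name from the console

# One summary per training job launched by the tuner
summaries = sm.list_training_jobs_for_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name)['TrainingJobSummaries']

for summary in summaries:
    detail = sm.describe_training_job(TrainingJobName=summary['TrainingJobName'])
    print(summary['TrainingJobName'],
          detail['TrainingJobStatus'],
          detail.get('FailureReason', ''))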