我的电脑安装了gpu GeForce 940MX。它的内存带宽为16.02 GB / s。我正在尝试使用以下代码使用UNET模型训练LUNA数据集。
from __future__ import print_function
import numpy as np
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from keras.layers import concatenate
from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras import backend as K
K.set_image_dim_ordering('th') # Theano dimension ordering in this code
img_rows = 512
img_cols = 512
smooth = 1.
def dice_coef(y_true, y_pred):
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
def dice_coef_np(y_true,y_pred):
y_true_f = y_true.flatten()
y_pred_f = y_pred.flatten()
intersection = np.sum(y_true_f * y_pred_f)
return (2. * intersection + smooth) / (np.sum(y_true_f) + np.sum(y_pred_f) + smooth)
def dice_coef_loss(y_true, y_pred):
return -dice_coef(y_true, y_pred)
def get_unet():
inputs = Input((1,img_rows, img_cols))
conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)
conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)
#up6 = merge([UpSampling2D(size=(2, 2))(conv5), conv4], mode='concat', concat_axis=1)
up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), conv4], axis=1)
conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)
#up7 = merge([UpSampling2D(size=(2, 2))(conv6), conv3], mode='concat', concat_axis=1)
up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), conv3], axis=1)
conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)
#up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv2], mode='concat', concat_axis=1)
up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), conv2], axis=1)
conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)
#up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv1], mode='concat', concat_axis=1)
up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), conv1], axis=1)
conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)
conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9)
model = Model(inputs=inputs, outputs=conv10)
model.compile(optimizer=Adam(lr=1.0e-5), loss=dice_coef_loss, metrics=[dice_coef])
return model
def train_and_predict(use_existing):
print('-'*30)
print('Loading and preprocessing train data...')
print('-'*30)
imgs_train = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"trainImages.npy").astype(np.float32)
imgs_mask_train = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"trainMasks.npy").astype(np.float32)
imgs_test = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"testImages.npy").astype(np.float32)
imgs_mask_test_true = np.load("C:/Users/hirplk/Desktop/unet/Luna2016-Lung-Nodule-Detection-master_new/DATA_PROCESS/scratch/cse/dual/cs5130287/Luna2016/output_final/"+"testMasks.npy").astype(np.float32)
mean = np.mean(imgs_train) # mean for data centering
std = np.std(imgs_train) # std for data normalization
imgs_train -= mean # images should already be standardized, but just in case
imgs_train /= std
print('-'*30)
print('Creating and compiling model...')
print('-'*30)
model = get_unet()
# Saving weights to unet.hdf5 at checkpoints
model_checkpoint = ModelCheckpoint('unet.hdf5', monitor='loss', save_best_only=True)
#
# Should we load existing weights?
# Set argument for call to train_and_predict to true at end of script
if use_existing:
model.load_weights('./unet.hdf5')
#
# The final results for this tutorial were produced using a multi-GPU
# machine using TitanX's.
# For a home GPU computation benchmark, on my home set up with a GTX970
# I was able to run 20 epochs with a training set size of 320 and
# batch size of 2 in about an hour. I started getting reseasonable masks
# after about 3 hours of training.
#
print('-'*30)
print('Fitting model...')
print('-'*30)
model.fit(imgs_train, imgs_mask_train, batch_size=50, epochs=10, verbose=1, shuffle=True,
callbacks=[model_checkpoint])
# loading best weights from training session
print('-'*30)
print('Loading saved weights...')
print('-'*30)
model.load_weights('./unet.hdf5')
print('-'*30)
print('Predicting masks on test data...')
print('-'*30)
num_test = len(imgs_test)
imgs_mask_test = np.ndarray([num_test,1,512,512],dtype=np.float32)
for i in range(num_test):
imgs_mask_test[i] = model.predict([imgs_test[i:i+1]], verbose=0)[0]
np.save('masksTestPredicted.npy', imgs_mask_test)
mean = 0.0
for i in range(num_test):
mean+=dice_coef_np(imgs_mask_test_true[i,0], imgs_mask_test[i,0])
mean/=num_test
print("Mean Dice Coeff : ",mean)
if __name__ == '__main__':
train_and_predict(False)
但是当使用GPU运行它时我会收到以下错误。
Warning (from warnings module):
File "C:\Research\Python_installation\lib\site-packages\h5py\__init__.py", line 36
from ._conv import register_converters as _register_converters
FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
Using TensorFlow backend.
------------------------------
Loading and preprocessing train data...
------------------------------
------------------------------
Creating and compiling model...
------------------------------
------------------------------
Fitting model...
------------------------------
Epoch 1/10
Traceback (most recent call last):
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1327, in _do_call
return fn(*args)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1306, in _run_fn
status, run_metadata)
File "C:\Research\Python_installation\lib\contextlib.py", line 66, in __exit__
next(self.gen)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
[[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
[[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
train_and_predict(False)
File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 127, in train_and_predict
callbacks=[model_checkpoint])
File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1657, in fit
validation_steps=validation_steps)
File "C:\Research\Python_installation\lib\site-packages\keras\engine\training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 2357, in __call__
**self.session_kwargs)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 895, in run
run_metadata_ptr)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1124, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1321, in _do_run
options, run_metadata)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[50,32,512,512]
[[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
[[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op 'conv2d_1/convolution', defined at:
File "<string>", line 1, in <module>
File "C:\Research\Python_installation\lib\idlelib\run.py", line 124, in main
ret = method(*args, **kwargs)
File "C:\Research\Python_installation\lib\idlelib\run.py", line 351, in runcode
exec(code, self.locals)
File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 150, in <module>
train_and_predict(False)
File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 106, in train_and_predict
model = get_unet()
File "C:\Users\hirplk\Desktop\unet\DSB3Tutorial-master\tutorial_code\LUNA_train_unet.py", line 39, in get_unet
conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
File "C:\Research\Python_installation\lib\site-packages\keras\engine\topology.py", line 603, in __call__
output = self.call(inputs, **kwargs)
File "C:\Research\Python_installation\lib\site-packages\keras\layers\convolutional.py", line 164, in call
dilation_rate=self.dilation_rate)
File "C:\Research\Python_installation\lib\site-packages\keras\backend\tensorflow_backend.py", line 3195, in conv2d
data_format=tf_data_format)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 672, in convolution
op=op)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 338, in with_space_to_batch
return op(input, num_spatial_dims, padding)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 664, in op
name=name)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 131, in _non_atrous_convolution
name=name)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 397, in conv2d
data_format=data_format, name=name)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
op_def=op_def)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\Research\Python_installation\lib\site-packages\tensorflow\python\framework\ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[50,32,512,512]
[[Node: conv2d_1/convolution = Conv2D[T=DT_FLOAT, data_format="NCHW", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_261, conv2d_1/kernel/read)]]
[[Node: loss/mul/_273 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_3022_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
有人可以请你解释一下这个错误背后的原因,ResourceExhaustedError。是因为GPU的内存不足以加载数据集。 没有GPU,这很好用。但花了大约6个小时来完成一个时代