How to write a custom DataGenerator for a Keras model

Date: 2019-05-06 12:12:16

Tags: python file-io keras generator

My current approach is as follows. First, I load the images into memory like this:

import os
import gc

import numpy as np
from skimage.io import imread
from skimage.transform import resize


def load_data_from_directory(root_dir, image_height, image_format = 'jpg', mask_format = 'png'):
    """
    Loads train images and corresponding masks with the specified image size.
    Masks should have the same name as their image.
    Output values are divided by 255 to lie between 0 and 1.
    Folder locations:
        > images (jpg format)
        > segmentation

    Example of usage:
        from common_blocks.data_loaders import load_data_from_directory

        data_dir = './data_objects'
        x_train, y_train = load_data_from_directory(data_dir, image_height = 256)
    """
    data = []

    for stage in ['train']:  # a 'test' stage can be added
        directory = os.path.join(root_dir, 'images')
        file_names = [filename.replace(image_format, mask_format) for filename in os.listdir(directory)]

        fps = [os.path.join(directory, filename) for filename in os.listdir(directory)]
        for content in ['images', 'segmentation']:
            # construct the path to each image
            directory = os.path.join(root_dir, content)

            if content != 'images':
                fps = [os.path.join(directory, filename) for filename in file_names]
            # read images, scaled to [0, 1]
            images = [imread(filepath)/255 for filepath in fps]

            # if images have different sizes, resize them first
            images = [resize(image, (image_height, image_height)) for image in images]
            # stack into one np.array
            np_images = np.stack(images, axis=0)
            data.append(np_images)
    del images, file_names
    gc.collect()

    return data

x_train, y_train = load_data_from_directory('./train', image_height, 'jpg', 'png')

Then I feed these images to a DataGenerator:

from keras.utils import Sequence

class DataGenerator(Sequence):
    '''
    Sample usage:
        test_generator = DataGenerator(x_train, y_train, 1,
                                       image_sizes, image_sizes, 1, True)
        Xtest, ytest = test_generator.__getitem__(1)
        plt.imshow(Xtest[0])
        plt.show()
        plt.imshow(ytest[0, :, :, 0])
        plt.show()
    '''
    def __init__(self, X, y, batch_size, height, width, nb_y_features, augmentation = True):
        'Initialization'
        self.batch_size = batch_size
        self.X = X
        self.y = y
        self.indexes = None
        self.currentIndex = 0
        self.augmentation = augmentation
        self.on_epoch_end()
        self.height = height
        self.width = width
        self.nb_y_features = nb_y_features

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate the indexes of the batch
        data_index_min = int(index*self.batch_size)
        data_index_max = int(min((index+1)*self.batch_size, len(self.indexes)))
        indexes = self.indexes[data_index_min:data_index_max]

        this_batch_size = len(indexes)  # The last batch can be smaller than the others

        X = np.empty((this_batch_size, self.width, self.height, 3))
        y = np.empty((this_batch_size, self.width, self.height, self.nb_y_features), dtype=int)

        for i, sample_index in enumerate(indexes):
            X_sample, y_sample = self.X[sample_index].copy(), self.y[sample_index].copy()
            if self.augmentation:
                # aug() is an augmentation pipeline defined elsewhere
                # (see the sketch below)
                augmented = aug()(image=X_sample, mask=y_sample)
                image_augm = augmented['image']
                mask_augm = augmented['mask']
                X[i, ...] = image_augm
                y[i, ...] = mask_augm
            else:
                X[i, ...] = X_sample
                y[i, ...] = y_sample

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = list(range(len(self.X)))
        np.random.shuffle(self.indexes)
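The aug() call above is not defined in the question; presumably it returns an augmentation pipeline that accepts image= and mask= keyword arguments, in the style of albumentations. A minimal sketch under that assumption (the actual transforms used are unknown):

import albumentations as A

def aug():
    # Hypothetical pipeline; Compose applies the same spatial transforms
    # to both the image and the mask and returns a dict with keys
    # 'image' and 'mask'.
    return A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
    ])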

This generator is then used to train the model:

training_generator = DataGenerator(x_train, y_train, batch_size,
                                   height = image_width, width = image_height,
                                   nb_y_features = 1, augmentation = True)
model = Unet(BACKBONE, encoder_weights='imagenet', encoder_freeze = False)
model.compile(optimizer = Adam(), loss=bce_jaccard_loss, metrics=[iou_score])
history = model.fit_generator(training_generator, shuffle = True, epochs=10)
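Because DataGenerator subclasses keras.utils.Sequence, fit_generator can also prepare batches in background workers; a possible call (the worker count here is an arbitrary choice):

history = model.fit_generator(training_generator, epochs=10,
                              workers=4, use_multiprocessing=True)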

The problem is the size of the data. If it is small enough to fit in memory, everything works fine; as soon as it gets larger, training fails with an out-of-memory error. How can I read files randomly, directly from a folder?

1 Answer:

Answer 0 (score: 0)

Something like this should work:

class DataGeneratorFolder(Sequence):
    '''
    Sample usage:
        if to_debug:
            test_generator = DataGeneratorFolder(image_names, masks_names,
                                                 batch_size=2,
                                                 image_size=256,
                                                 nb_y_features=1, augmentation=True)
            Xtest, ytest = test_generator.__getitem__(0)
            plt.imshow(Xtest[0])
            plt.show()
            plt.imshow(ytest[0, :, :, 0])
            plt.show()
    '''
    def __init__(self, image_filenames, mask_names, batch_size,
                 image_size=768, nb_y_features = 1, augmentation = True,
                 center_crop_prop = 0.5):
        self.image_filenames, self.mask_names = image_filenames, mask_names
        self.batch_size = batch_size
        self.currentIndex = 0
        self.augmentation = augmentation
        self.on_epoch_end()
        self.image_size = image_size
        self.nb_y_features = nb_y_features
        self.indexes = None
        self.center_crop_prop = center_crop_prop

    def __len__(self):
        'Denotes the number of batches per epoch (must return an int)'
        return int(np.ceil(len(self.image_filenames) / float(self.batch_size)))

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        # shuffle images and masks together so the pairs stay aligned
        self.image_filenames, self.mask_names = shuffle(self.image_filenames, self.mask_names)

    def read_image_mask(self, image_name, mask_name):
        # image scaled to [0, 1], mask binarized
        return (imread(image_name)/255).astype(np.float32), \
               (imread(mask_name, as_gray = True) > 0).astype(np.int8)

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate the indexes of the batch
        data_index_min = int(index*self.batch_size)
        data_index_max = int(min((index+1)*self.batch_size, len(self.image_filenames)))

        indexes = self.image_filenames[data_index_min:data_index_max]
        this_batch_size = len(indexes)  # The last batch can be smaller than the others
        X = np.empty((this_batch_size, self.image_size, self.image_size, 3), dtype=np.float32)
        y = np.empty((this_batch_size, self.image_size, self.image_size, self.nb_y_features), dtype=np.uint8)
        for i, sample_index in enumerate(indexes):
            X_sample, y_sample = self.read_image_mask(self.image_filenames[index * self.batch_size + i],
                                                      self.mask_names[index * self.batch_size + i])

            random_crop_prop = 1
            if self.augmentation:
                if np.sum(y_sample) > 0:  # mask is not empty
                    # with probability center_crop_prop, crop around the object
                    if np.random.choice(['crop_with_object', 'crop_random'], 1,
                                        p=[self.center_crop_prop, 1 - self.center_crop_prop]) == ['crop_with_object']:
                        X_sample, y_sample = random_crop_box_center(X_sample, y_sample,
                                                                    self.image_size, self.image_size)
                        random_crop_prop = 0

                augmented = aug_with_crop(self.image_size, random_crop_prop)(image=X_sample, mask=y_sample)
                image_augm = augmented['image']
                mask_augm = augmented['mask'].reshape(self.image_size, self.image_size, self.nb_y_features)

                X[i, ...] = image_augm
                y[i, ...] = mask_augm
            else:
                X[i, ...] = X_sample
                y[i, ...] = y_sample

        return X, y
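aug_with_crop and random_crop_box_center are not defined in the answer, and shuffle above is presumably sklearn.utils.shuffle. A minimal sketch of what the two helpers might look like, assuming albumentations for the pipeline; the transforms and the crop logic are assumptions, not the author's actual code:

import albumentations as A
import numpy as np

def aug_with_crop(image_size, crop_prob = 1):
    # Hypothetical pipeline: random crop to the target size plus light
    # augmentations; crop_prob=0 skips the crop when the sample was
    # already cropped around an object by random_crop_box_center.
    return A.Compose([
        A.RandomCrop(height=image_size, width=image_size, p=crop_prob),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
    ])

def random_crop_box_center(image, mask, height, width):
    # Hypothetical helper: crop a height x width window centered on the
    # mask's foreground pixels, clamped to the image borders.
    ys, xs = np.nonzero(mask)
    cy, cx = int(ys.mean()), int(xs.mean())
    y0 = min(max(cy - height // 2, 0), image.shape[0] - height)
    x0 = min(max(cx - width // 2, 0), image.shape[1] - width)
    return image[y0:y0 + height, x0:x0 + width], mask[y0:y0 + height, x0:x0 + width]

To drive the generator straight from disk, build aligned lists of image and mask paths and train on them; the images/ and segmentation/ layout from the question is assumed here:

import os
from sklearn.utils import shuffle  # used by DataGeneratorFolder.on_epoch_end

root = './train'  # assumed layout: images/ and segmentation/ subfolders
image_names = sorted(os.path.join(root, 'images', f)
                     for f in os.listdir(os.path.join(root, 'images')))
masks_names = sorted(os.path.join(root, 'segmentation', f)
                     for f in os.listdir(os.path.join(root, 'segmentation')))

training_generator = DataGeneratorFolder(image_names, masks_names,
                                         batch_size=batch_size,
                                         image_size=256,
                                         nb_y_features=1, augmentation=True)
history = model.fit_generator(training_generator, epochs=10)

This way only one batch of images is read and augmented at a time, so memory use no longer grows with the size of the dataset.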