Keras 自定义数据集:以图像作为标签(ground truth,真实值)

时间:2021-03-02 09:55:39

标签: image tensorflow keras dataset training-data

我正在构建一个图像去噪模型,并使用 ImageDataGenerator.flow_from_directory 加载数据集。数据集由两个文件夹组成:一个存放带噪声的输入图像,另一个存放与之对应的干净图像。我希望生成器把第一个文件夹中的图像作为输入,把另一个文件夹中的图像作为“标签”(ground truth,真实值)。

按照我目前的做法,两个文件夹中的所有图像都被当作输入,而文件夹名称被当作标签。我可以通过挑选特定批次来手动取出图像并用它们训练,但这样很不方便,而且这恐怕也不是该生成器的设计用法。

这样做的正确方法是什么?可能有此功能,但我找不到。

1 个答案:

答案 0 :(得分:0)

我也遇到过类似的问题,最后发现需要编写一个自定义生成器,把图像送入 model.fit。代码(相当长)如下。

import os
import pandas as pd
import numpy as np
import glob
import cv2
from sklearn.model_selection import train_test_split

def create_df(image_dir, label_dir, shuffle=True):
    """Build a DataFrame that pairs each input image with its label image.

    Both directories are searched recursively. Files are paired by their path
    relative to the searched directory with the extension removed, so
    ``<image_dir>/a/1.jpg`` is paired with ``<label_dir>/a/1.png``.

    Args:
        image_dir: directory containing the model input images.
        label_dir: directory containing the target (label) images.
        shuffle: when True, the rows are shuffled with a fixed seed (123).

    Returns:
        A DataFrame with string columns 'image' and 'label', one row per pair.

    Raises:
        ValueError: if the two directories do not contain matching files.
    """
    def _keyed_files(root):
        # '**/*' also matches sub-directories and glob gives no ordering
        # guarantee, so keep regular files only and sort them by the key that
        # is used for pairing (relative path without extension).
        paths = [p for p in glob.glob(root + '/**/*', recursive=True) if os.path.isfile(p)]
        return sorted((os.path.splitext(os.path.relpath(p, root))[0], p) for p in paths)

    images = _keyed_files(image_dir)
    labels = _keyed_files(label_dir)
    image_keys = [key for key, _ in images]
    label_keys = [key for key, _ in labels]
    if image_keys != label_keys:
        unmatched = sorted(set(image_keys) ^ set(label_keys))
        raise ValueError(
            'image and label files do not match: %d image files, %d label files, '
            'unmatched names (first 10): %s'
            % (len(image_keys), len(label_keys), unmatched[:10])
        )
    df = pd.DataFrame({
        'image': [path for _, path in images],
        'label': [path for _, path in labels],
    }).astype(str)
    if shuffle:
        df = df.sample(frac=1.0, replace=False, weights=None, random_state=123, axis=0).reset_index(drop=True)
    return df

class jpgen():
    """Splits an image/label DataFrame and yields (image, label) batches.

    The DataFrame must have the string columns 'image' and 'label' holding file
    paths. ``flow`` returns an endless Python generator that can be passed to
    model.fit / model.predict.

    Attributes set by ``__init__``:
        df: private copy of the input DataFrame.
        train_df: rows used for training (the whole frame if train_split is None).
        test_df / valid_df: only set when train_split (and test_split) are given.
        tr_gen_len, test_gen_len, valid_gen_len: number of unique image paths
            in the corresponding subset.
        test_gen_filenames, valid_gen_filenames: image paths of the subset.
    """
    # Kept for backward compatibility: running count of the batches yielded by
    # all generators of this class. It is no longer used as the read position,
    # because one cursor shared by the train/validation/test generators made
    # each of them skip the batches consumed by the others.
    batch_index=0

    def __init__(self, df, train_split=None, test_split=None):
        """Store the frame and split it into train / test / validation parts.

        Args:
            df: DataFrame with 'image' and 'label' path columns.
            train_split: fraction (0-1) of rows used for training, or None to
                use every row for training.
            test_split: fraction (0-1) of all rows used for testing; the rows
                left after train and test become the validation set. If None,
                every non-training row goes to the test set and no validation
                set is created.
        """
        self.train_split=train_split
        self.test_split=test_split
        self.df=df.copy()  # work on a copy so the caller's frame is untouched
        if self.train_split is not None:
            # shuffle=False keeps the row order, so the split is deterministic
            self.train_df, dummy_df=train_test_split(self.df, train_size=self.train_split, shuffle=False)
            if self.test_split is not None:
                # test_split is a fraction of the full frame; convert it to a
                # fraction of the rows that remain after the training split
                t_split=self.test_split/(1.0-self.train_split)
                self.test_df, self.valid_df=train_test_split(dummy_df, train_size=t_split, shuffle=False)
                self.valid_gen_len=len(self.valid_df['image'].unique())
                self.valid_gen_filenames=list(self.valid_df['image'])
            else:
                self.test_df=dummy_df
            self.test_gen_len=len(self.test_df['image'].unique())
            self.test_gen_filenames=list(self.test_df['image'])
        else:
            self.train_df=self.df
        self.tr_gen_len=len(self.train_df['image'].unique())

    def flow(self, batch_size=32, image_shape=None, rescale=None, shuffle=True, subset=None):
        """Yield (image_array, label_array) batches forever.

        Args:
            batch_size: number of samples per batch.
            image_shape: size passed unchanged to cv2.resize as ``dsize``.
                NOTE: OpenCV reads ``dsize`` as (width, height).
            rescale: optional factor every pixel value is multiplied by.
            shuffle: when True the subset is shuffled once (fixed seed 123)
                before batching starts.
            subset: 'training' or 'test'; any other value selects the
                validation rows. When the instance was built without
                train_split, every value selects the full frame.

        Yields:
            Tuple (image_array, label_array) of numpy arrays with batch_size
            entries each. When the end of the subset is reached, reading wraps
            around to its first row.
        """
        # Still recorded on the instance for callers that read these values;
        # the loop below uses the local arguments so that several generators
        # created from one instance cannot overwrite each other's settings.
        self.batch_size=batch_size
        self.image_shape=image_shape
        self.shuffle=shuffle
        self.subset=subset
        self.rescale=rescale
        if subset=='training' or self.train_split is None:
            op_df=self.train_df
        elif subset=='test':
            op_df=self.test_df
        else:
            op_df=self.valid_df
        if shuffle:
            op_df=op_df.sample(frac=1.0, replace=False, weights=None, random_state=123, axis=0).reset_index(drop=True)
        sample_count=len(op_df['image'])
        batches_done=0  # read position of THIS generator only
        while True:
            image_batch_list=[]
            label_batch_list=[]
            start=batches_done * batch_size
            for i in range(start, start + batch_size):
                j=i % sample_count  # wrap around to the first row at the end
                row=op_df.iloc[j]
                # -1 is cv2.IMREAD_UNCHANGED: the label file is read as stored
                label_image=cv2.imread(row['label'], -1)
                label_image=cv2.cvtColor(label_image, cv2.COLOR_BGR2RGB)
                image_image=cv2.imread(row['image'])
                image_image=cv2.cvtColor(image_image, cv2.COLOR_BGR2RGB)
                label_image=cv2.resize(label_image, image_shape)
                image_image=cv2.resize(image_image, image_shape)
                if rescale is not None:
                    label_image=label_image * rescale
                    image_image=image_image * rescale
                label_batch_list.append(label_image)
                image_batch_list.append(image_image)
            image_array=np.array(image_batch_list)
            label_array=np.array(label_batch_list)
            batches_done +=1
            jpgen.batch_index +=1
            yield (image_array, label_array)

下面的代码展示了如何使用上面的函数为model.fit制作生成器

# Files from image_dir are yielded as the first element of every batch (the
# model input); files from label_dir are yielded as the second element (the
# target). For a denoising model that means: noisy images in image_dir, clean
# images with the same file names in label_dir.
image_dir = r'C:\Temp\gen_test\images'
label_dir = r'C:\Temp\gen_test\labels'
shuffle = False  # if True, create_df shuffles the rows of the dataframe
# dataframe with the path columns 'image' and 'label'
df = create_df(image_dir, label_dir ,shuffle)

# split fractions: 80% training, 10% test, the remaining rows are validation
train_split = .8
test_split = .1

# batch and image settings shared by all three generators
batch_size = 32
height = 224  # image height for generator output images and labels
width = 224  # image width for generator output images and labels
channels = 3  # number of channels in the images
image_shape = (height, width)
rescale = 1/255  # factor applied to every pixel value

# the generator class performs the train/test/validation split
gen = jpgen(df, train_split=train_split, test_split=test_split)
tr_gen_len = gen.tr_gen_len
test_gen_len = gen.test_gen_len
valid_gen_len = gen.valid_gen_len
test_filenames = gen.test_gen_filenames  # image paths of the test subset

# number of full batches per subset
train_steps = tr_gen_len//batch_size  # steps_per_epoch for model.fit
valid_steps = valid_gen_len//batch_size  # validation_steps for model.fit
test_steps = test_gen_len//batch_size  # steps for model.predict

# instantiate the generators; they differ only in the subset they read
flow_options = dict(batch_size=batch_size, image_shape=image_shape, rescale=rescale, shuffle=False)
train_gen = gen.flow(subset='training', **flow_options)
valid_gen = gen.flow(subset='valid', **flow_options)
test_gen = gen.flow(subset='test', **flow_options)

构建好模型之后,按如下方式进行训练和预测:

# Train on the training generator and validate on the validation generator.
# Both generators are endless, so the step counts define the epoch length.
history = model.fit(
    train_gen,
    epochs=epochs,
    steps_per_epoch=train_steps,
    validation_data=valid_gen,
    validation_steps=valid_steps,
    verbose=1,
    shuffle=True,
)

# Run inference over the full batches of the test generator.
predictions = model.predict(test_gen, steps=test_steps)