这可能是一个远景,但这对我来说是一个反复出现的问题,但是我认为这应该是一个非常标准的情况,因此社区可能会从中受益。
我制作了一个从keras.utils.Sequence继承的DataGenerator类。
class DataGenerator(keras.utils.Sequence):
def __init__(self, options, shuffle, train):
"""
Parameters:
-----------
seed - CV seed
fold - CV fold
splits - total number of CV splits
"""
self.save_dir = options.save_dir
self.shuffle = shuffle
self.batch_size = options.batch_size
self.train = train
self.T = options.max_length
self.in_channel = options.in_feature
self.out_channel = options.mfcc
self.cats = options.dataset
self.autoencoder = options.autoencoder
self.inversion = options.inversion
# This is when all
ranges, train_idx, val_idx = get_indices(self.save_dir,
options.fold,
options.seed,
options.splits,
self.shuffle,
self.cats)
self.ranges = ranges
if self.train:
self.train_size = len(train_idx)
include = int(np.ceil(options.percentage*self.train_size))
self.list_IDs = train_idx[:include]
else:
self.list_IDs = val_idx
self.val_size = len(val_idx)
print(self.val_size)
self.on_epoch_end()
def on_epoch_end(self):
"""
Updates indexes after each epoch
"""
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __getitem__(self, index):
"""
Generate one batch of data
"""
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
if self.autoencoder:
X, Y = self.__data_generation(list_IDs_temp)
return X, Y
if self.inversion:
Y, X = self.__data_generation(list_IDs_temp)
return Y, X
else:
X, y, w = self.__data_generation(list_IDs_temp)
return X, y, w
def __len__(self):
"""
Denotes the number of batches per epoch
"""
return int(np.floor(len(self.list_IDs) / self.batch_size))
def __data_generation(self, list_IDs_temp):
"""
Generates data containing batch_size samples
"""
# Initialization
X = np.zeros((self.batch_size, self.T, self.in_channel))
Y = np.zeros((self.batch_size, self.T, self.out_channel))
weights = np.ones((self.batch_size, self.T))
# Generate data
for i, ID in enumerate(list_IDs_temp):
# Store sample
cat_id, in_id = get_category(ID,self.ranges,self.cats,self.save_dir)
Xtemp = np.load(self.save_dir +
str(cat_id) +
"/dataset_" +
str(in_id) + '.npy')
X[i,:,:] = Xtemp
# Store class - truncate the end if out_channel is less
Ytemp = np.load(self.save_dir +
str(cat_id) +
"/spset_" +
str(in_id) + '.npy')
Y[i,:,:] = Ytemp
f0 = np.load(self.save_dir +
str(cat_id) +
"/puref0set_" +
str(in_id) + '.npy')
T = get_duration(f0)
if self.autoencoder:
return Y,[X,Y]
if self.inversion:
if not np.any(np.isfinite(Y)):
print("problem")
if not np.any(np.isfinite(X)):
print("problem")
return Y,X
else:
return X,Y,weights
现在的问题是,在使用fit_generator馈入神经网络之前,我想绘制验证集。到目前为止,我这样做的标准方法是:
speech, ybunch = val_gen.__getitem__(0)
import matplotlib.pyplot as plt
for i in range(30):
plt.plot(speech[i,:,:], color="r")
plt.show()
但是有时(以及再次使用此代码),我得到 getitem (0)的结果很奇怪,这意味着我在批处理中得到了一个示例,而其他示例全为零。 / strong>大多数时候,我可以通过将其精确设置为批次大小来解决此问题(因为很多时候,批次大小不能被火车/ val设置大小整除)。
我对生成器缺乏深入的了解,但我确实相信这是标准用例,应该可以做到。