I have a large dataset with n_samples, n_features, n_classes = 346679, 10233, 86, and I am trying to build a classifier on it. For this I am using a multi-layer perceptron built with the Keras Sequential model.
The DataGenerator class
import numpy as np
from scipy.sparse import load_npz
from keras.utils import np_utils


class DataGeneratorKeras:
    def __init__(self, num_rows, n_classes, n_samples, n_features, batch_size=1, shuffle=True):
        self.num_rows = num_rows
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.flag = False

    def __get_exploration_order(self, list_ids):
        """
        Generates the order of exploration.
        :param list_ids: row indexes to iterate over
        :return: (possibly shuffled) array of positions into list_ids
        """
        # Find exploration order
        indexes = np.arange(len(list_ids))
        if self.shuffle:
            np.random.shuffle(indexes)
        return indexes

    def __data_generation(self, list_ids_temp, n_classes):
        """
        Generates data of batch_size samples.
        :param list_ids_temp: indexes belonging to the current batch
        :param n_classes: number of target classes
        :return: sparse feature matrix and its one-hot encoded labels
        """
        index = list_ids_temp[0]
        fv = load_npz("data_file_" + str(index) + ".npz")
        labels_complete = load(...)  # Load labels
        partial_labels = labels_complete[index]
        del labels_complete
        y = self.sparsify(partial_labels, n_classes)
        return fv, y

    @staticmethod
    def sparsify(y, n_classes):
        """
        One-hot encodes the integer labels.
        :return: binary class matrix of shape (len(y), n_classes)
        """
        label_encoder = np_utils.to_categorical(y, n_classes)
        return label_encoder

    def generate(self, list_ids):
        """
        Generates batches of samples indefinitely, as fit_generator expects.
        :param list_ids: row indexes to draw batches from
        :return: yields (features, labels) tuples
        """
        # Infinite loop
        while 1:
            # Generate order of exploration of dataset
            indexes = self.__get_exploration_order(list_ids)
            # Generate batches
            imax = int(len(indexes) / self.batch_size)
            for i in range(imax):
                # Find list of IDs for this batch
                list_ids_temp = [list_ids[k] for k in indexes[i * self.batch_size:(i + 1) * self.batch_size]]
                # Generate data
                x, y = self.__data_generation(list_ids_temp, self.n_classes)
                yield x.toarray(), y
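For reference, a minimal smoke test of the generator might look like the sketch below; it assumes the data_file_<index>.npz files used in __data_generation exist on disk (the labels file stays elided as in the code above):

# Sketch of a smoke test for DataGeneratorKeras, assuming the .npz files exist.
gen = DataGeneratorKeras(num_rows=347, n_classes=86, n_samples=346679,
                         n_features=10233, batch_size=1).generate(range(347))
x, y = next(gen)
print(x.shape, y.shape)  # the feature batch and its one-hot labels should have matching row counts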
The Script class
class Script:
    def __init__(self, num_rows, batch_size, test_size, n_classes, n_samples, n_features):
        self.batch_size = batch_size
        self.num_rows = num_rows
        self.test_size = test_size
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.n_features = n_features

    def main(self):
        validation = int(self.test_size * self.num_rows)
        train = self.num_rows - validation
        params = {
            'num_rows': self.num_rows,
            'n_samples': self.n_samples,
            'n_features': self.n_features,
            'n_classes': self.n_classes,
            'batch_size': self.batch_size,
            'shuffle': True
        }
        partition = {'train': range(train), 'validation': range(train, self.num_rows)}
        # Generators
        training_generator = DataGeneratorKeras(**params).generate(partition['train'])
        validation_generator = DataGeneratorKeras(**params).generate(partition['validation'])
        return training_generator, validation_generator, partition


if __name__ == "__main__":
    script = Script(num_rows=347, test_size=0.25, n_classes=86, n_samples=346679, n_features=10233, batch_size=1)
    training_generator, validation_generator, partition = script.main()
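With num_rows=347 and test_size=0.25, this split works out to int(0.25 * 347) = 86 validation rows and 261 training rows, which can be confirmed directly:

# Quick check of the split produced by Script.main()
print(len(partition['train']), len(partition['validation']))  # 261 86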
Building the model
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam


def classifier_base_data(dropout, learning_rate):
    model = Sequential()
    model.add(Dense(2**13, input_shape=(script.n_features,), activation='relu', name="l_input"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**12, input_dim=2**13, activation='relu', name="l_hidden_1"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**11, input_dim=2**12, activation='relu', name="l_hidden_2"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**10, input_dim=2**11, activation='relu', name="l_hidden_3"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**9, input_dim=2**10, activation='relu', name="l_hidden_4"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**8, input_dim=2**9, activation='relu', name="l_hidden_5"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(2**7, input_dim=2**8, activation='relu', name="l_hidden_6"))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(script.n_classes, activation='softmax', name="l_output"))
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()  # summary() prints directly; wrapping it in print() would only add "None"
    return model
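For scale: with 10233 input features, the first Dense layer alone holds 10233 * 2**13 weights plus 2**13 biases, which model.summary() should confirm:

# Parameter count of the input layer alone: weights + biases
print(10233 * 2**13 + 2**13)  # 83836928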
When I run the model using the Keras fit function, I am able to reach 25% val_acc and acc.
history = model.fit(x_train.toarray(), y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_validation.toarray(), y_validation))
Since the data is large, I am using a Keras DataGenerator, following the well-written tutorial keras-datagen-tutorial. When I run the model with fit_generator, I get 0% val_acc.
model.fit_generator(
    generator=training_generator,
    steps_per_epoch=len(partition['train']),
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=len(partition['validation']),
    verbose=1
)
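To narrow down whether the generator itself is at fault, one option (a minimal sketch, assuming the same data_file_<index>.npz files and shuffle=False so the first batch must come from index 0) is to compare a yielded batch against the directly loaded data used in the plain fit call:

# Hypothetical sanity check: with shuffle=False the first batch is index 0,
# so it should match the directly loaded file, and the labels should be valid one-hot rows.
from scipy.sparse import load_npz
import numpy as np

debug_gen = DataGeneratorKeras(num_rows=347, n_classes=86, n_samples=346679,
                               n_features=10233, batch_size=1, shuffle=False).generate(range(347))
x_batch, y_batch = next(debug_gen)
fv = load_npz("data_file_0.npz").toarray()
print(np.array_equal(x_batch, fv))            # features should match file 0 exactly
print(np.allclose(y_batch.sum(axis=1), 1.0))  # every row of a valid one-hot encoding sums to 1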
Is there any issue in the DataGenerator?