I am following this guide to train an MLP model with Keras on my custom dataset. I have trained and saved the model successfully, and the accuracy is high. However, when I load the saved model and try to predict on new text data, I cannot, because the TfidfVectorizer feature matrix built from the new text does not match the shape the model expects. After some research, I understand that I need to wrap everything in a pipeline so that the TfidfVectorizer features and the Keras model are saved together. Please help; my code is below, followed by a sketch of what I think the save/load step should look like.
import os
import random

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

# Vectorization parameters
NGRAM_RANGE = (1, 2)        # Use 1-grams + 2-grams.
TOP_K = 20000               # Limit on the number of selected features.
TOKEN_MODE = 'word'         # Split text into word tokens.
MIN_DOCUMENT_FREQUENCY = 2  # Drop tokens seen in fewer than 2 documents.
def load_data(data_path, seed=123):
    """Loads the five-category text dataset and splits it 75/25 into train/validation."""
    imdb_data_path = os.path.join(data_path, 'data')
    texts = []
    labels = []
    for category in ['apparel', 'elearning', 'electronics', 'travel', 'web_hosting']:
        train_path = os.path.join(imdb_data_path, category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    texts.append(f.read())
                if category == 'apparel':
                    labels.append(0)
                elif category == 'elearning':
                    labels.append(1)
                elif category == 'electronics':
                    labels.append(2)
                elif category == 'travel':
                    labels.append(3)
                else:
                    labels.append(4)

    # Shuffle texts and labels with the same seed so they stay aligned.
    random.seed(seed)
    random.shuffle(texts)
    random.seed(seed)
    random.shuffle(labels)

    # 75% training / 25% validation split.
    num_training_samples = int((1 - .25) * len(texts))
    return ((texts[:num_training_samples], labels[:num_training_samples]),
            (texts[num_training_samples:], labels[num_training_samples:]))
def ngram_vectorize(train_texts, train_labels, val_texts):
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,      # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)
    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)
    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)
    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    print(x_train.shape[1])
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val
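If I understand the mismatch correctly, it happens because ngram_vectorize fits a fresh TfidfVectorizer on whatever texts it is given, so new text produces a feature matrix with a different vocabulary and shape than the one the model was trained on. This is a minimal sketch of what I think prediction-time vectorization should look like instead; it assumes ngram_vectorize is changed to also return the fitted vectorizer and selector (the transform_new_texts name is my own, not from the guide):

def transform_new_texts(new_texts, vectorizer, selector):
    """Vectorizes unseen texts with the objects fitted on the training data.

    `vectorizer` and `selector` must be the already-fitted TfidfVectorizer
    and SelectKBest instances; only transform() is called, never fit(),
    so the output shape matches what the model was trained on.
    """
    x_new = vectorizer.transform(new_texts)
    return selector.transform(x_new).astype('float32')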
def get_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation
def mlp_model(layers, units, dropout_rate, input_shape):
    op_units, op_activation = get_activation(5)  # 5 output classes.
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))
    for _ in range(layers - 1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))
    model.add(Dense(units=op_units, activation=op_activation))
    return model
data = load_data('/home/rajapandey/crowler/new_exp/', seed=123)
(train_texts, train_labels), (val_texts, val_labels) = data
x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)
print(x_train.shape[1:])

model = mlp_model(2, 64, 0.2, x_train.shape[1:])
learning_rate = 1e-3
loss = 'sparse_categorical_crossentropy'
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]
history = model.fit(
    x_train,
    train_labels,
    epochs=1000,
    callbacks=callbacks,
    validation_data=(x_val, val_labels),
    verbose=2,  # Logs once per epoch.
    batch_size=128)
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
    acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

#result = model.predict(x_val[9])
#print('Result '+str(result))
#model.save('imdb_mlp_model.h5')
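Is something along these lines the right way to keep the TfidfVectorizer and the Keras model together? This is a minimal sketch of what I have in mind, assuming joblib is available for the scikit-learn objects, that ngram_vectorize is modified to return the fitted vectorizer and selector, and that the file names are placeholders:

import joblib

# After training: save the Keras model and the fitted sklearn objects side by side.
model.save('mlp_model.h5')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(selector, 'feature_selector.joblib')

# Later, in a fresh process: load everything back and predict on new text.
loaded_model = tf.keras.models.load_model('mlp_model.h5')
loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')
loaded_selector = joblib.load('feature_selector.joblib')

new_texts = ['sample review text to classify']
x_new = loaded_selector.transform(
    loaded_vectorizer.transform(new_texts)).astype('float32')
print(loaded_model.predict(x_new))

Or is a scikit-learn Pipeline wrapping the vectorizer, the selector, and a wrapped Keras estimator the cleaner approach?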