How to wrap a TfidfVectorizer and a Keras model in a pipeline and save them for use with a Flask API

Time: 2018-12-28 13:42:58

Tags: python scikit-learn keras tfidfvectorizer mlp

I am following this guide to train an MLP model with Keras on my custom dataset.

I have successfully trained and saved the Keras model with good accuracy. However, when I load the model and try to predict on new text data, it fails because the TfidfVectorizer feature matrix does not match: the vectorizer is refit on the new text, so its vocabulary (and therefore the feature dimensions) differ from training.
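One way to avoid that mismatch, independent of a pipeline, is to persist the fitted vectorizer and feature selector at training time and only ever call transform() on them afterwards. A rough sketch (vectorizer.pkl, selector.pkl, and new_texts are made-up names; vectorizer and selector are the fitted objects from the ngram_vectorize() function below):

import pickle

# During training, after vectorizer.fit_transform(...) and selector.fit(...),
# persist both fitted objects (hypothetical file names):
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('selector.pkl', 'wb') as f:
    pickle.dump(selector, f)

# At prediction time, load them back and only transform(), never fit():
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
with open('selector.pkl', 'rb') as f:
    selector = pickle.load(f)
# new_texts: a list of raw strings to classify (placeholder name).
x_new = selector.transform(vectorizer.transform(new_texts)).astype('float32')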

After doing some research, I understand that I need to wrap the TfidfVectorizer features and the Keras model together in a pipeline, so that they can be saved and loaded as a single unit. A sketch of what that could look like follows.
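One possible shape for this (a rough sketch, not tested against this exact setup) uses scikit-learn's Pipeline with the KerasClassifier wrapper; build_model and to_dense are illustrative helpers, and mlp_model refers to the function in the snippet below:

import pickle
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def to_dense(X):
    # Dense layers need a dense float array, not a scipy sparse matrix.
    return X.astype('float32').toarray()

def build_model():
    # Hypothetical helper: rebuild and compile the MLP with the number of
    # features kept by SelectKBest.
    model = mlp_model(2, 64, 0.2, (TOP_K,))
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3),
                  loss='sparse_categorical_crossentropy', metrics=['acc'])
    return model

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=NGRAM_RANGE, dtype='int32',
                              strip_accents='unicode', decode_error='replace',
                              analyzer=TOKEN_MODE, min_df=MIN_DOCUMENT_FREQUENCY)),
    # If the vocabulary ends up smaller than TOP_K, use k='all' instead.
    ('select', SelectKBest(f_classif, k=TOP_K)),
    ('densify', FunctionTransformer(to_dense, accept_sparse=True)),
    ('clf', KerasClassifier(build_fn=build_model, epochs=1000,
                            batch_size=128, verbose=2)),
])
pipeline.fit(train_texts, train_labels)

One caveat: pickling the whole fitted pipeline is likely to fail, because Keras models are generally not picklable. A common workaround is to save the fitted preprocessing steps with pickle and the Keras model with model.save() separately, as in the first sketch above.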

Please help; my current code is below:

import os
import random

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

NGRAM_RANGE = (1, 2)         # Use unigrams and bigrams.
TOP_K = 20000                # Keep at most the 20,000 best features.
TOKEN_MODE = 'word'          # Split text into word tokens.
MIN_DOCUMENT_FREQUENCY = 2   # Ignore tokens in fewer than 2 documents.

def load_data(data_path, seed=123):
    """Load texts and integer labels from per-category folders of .txt files."""
    data_dir = os.path.join(data_path, 'data')

    texts = []
    labels = []
    categories = ['apparel', 'elearning', 'electronics', 'travel', 'web_hosting']
    for label, category in enumerate(categories):
        category_path = os.path.join(data_dir, category)
        for fname in sorted(os.listdir(category_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(category_path, fname)) as f:
                    texts.append(f.read())
                labels.append(label)


    # Shuffle texts and labels in unison: reseeding with the same value
    # applies the same permutation to both lists.
    random.seed(seed)
    random.shuffle(texts)
    random.seed(seed)
    random.shuffle(labels)

    # Hold out 25% of the data for validation.
    num_training_samples = int((1 - .25) * len(texts))
    return ((texts[:num_training_samples], labels[:num_training_samples]),
            (texts[num_training_samples:], labels[num_training_samples:]))

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Fit TF-IDF n-gram features on the training texts and transform both sets."""
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    print(x_train.shape[1])  # number of n-gram features before selection
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

def get_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation


def mlp_model(layers, units, dropout_rate, input_shape):
    """Build a dropout-regularized MLP; output layer sized for the 5 categories."""
    op_units, op_activation = get_activation(5)  # 5 text categories
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model




data = load_data('/home/rajapandey/crowler/new_exp/', seed=123)


(train_texts, train_labels), (val_texts, val_labels) = data


x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)

print(x_train.shape[1:])

model = mlp_model(2, 64, 0.2, x_train.shape[1:])
learning_rate = 1e-3
loss = 'sparse_categorical_crossentropy'  # labels are integer class ids
optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

history = model.fit(
            x_train,
            train_labels,
            epochs=1000,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=128)

history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

#result = model.predict(x_val[9])
#print('Result '+str(result))
#model.save('imdb_mlp_model.h5')
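
For serving, a minimal Flask sketch could look like the following, assuming the vectorizer and selector were pickled as in the first sketch and the model was saved via the commented-out model.save('imdb_mlp_model.h5') line; the /predict route and JSON shape are illustrative:

import pickle
from flask import Flask, jsonify, request
import tensorflow as tf

app = Flask(__name__)

# Load all fitted artifacts once, at startup.
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
with open('selector.pkl', 'rb') as f:
    selector = pickle.load(f)
model = tf.keras.models.load_model('imdb_mlp_model.h5')

@app.route('/predict', methods=['POST'])
def predict():
    text = request.get_json()['text']
    # Transform with the *fitted* vectorizer and selector; densify for Keras.
    x = selector.transform(vectorizer.transform([text]))
    probs = model.predict(x.astype('float32').toarray())
    return jsonify({'label': int(probs.argmax(axis=1)[0])})

if __name__ == '__main__':
    app.run()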

0 Answers