I have a set of clustered documents, and each document now has a label. I want to build a classifier on top of this, train and test it, so that when I give it a new document/text it is assigned to the appropriate cluster. I use CountVectorizer to convert the documents into features; as I understand it, CountVectorizer builds its vocabulary from the unique set of words across all the documents I feed it (over 1000 docs). Now I create a classifier (KNN or Naive Bayes), and I have a new text file or document that I need to convert into features. But if I give CountVectorizer just that one document, it contains only a few words, so the resulting feature space will differ from that of the training and test documents, and the prediction will surely be wrong. Is there any way to use the same CountVectorizer object on the document I supply? Please guide me; any suggestions or approaches are welcome.
def classifierNaiveBayes(self):
    count_vectorizer = CountVectorizer(binary=True)
    train_documents = count_vectorizer.fit_transform(self.training_documents)
    classifier = BernoulliNB().fit(train_documents, self.training_labels)

    # Test phase
    count_wrong_predictions = 0
    for i in range(len(self.test_documents)):
        predicted_result = classifier.predict(
            count_vectorizer.transform([self.test_documents[i]]))[0]
        expected_result = self.test_labels[i]
        print("The predicted value is", predicted_result)
        print("The expected value is", expected_result)
        if predicted_result != expected_result:
            count_wrong_predictions += 1

    print("The percentage of prediction accuracy is",
          100 - (count_wrong_predictions / len(self.test_documents)) * 100)
I am also using the same CountVectorizer for the test data, which is why the code above works.
Answer 0 (score: 0)
Using CountVectorizer.transform is the correct way to classify test documents. When the test set is transformed with the vectorizer fitted on the training data, new vocabulary appearing only in the test set is simply not used. (Fitting the vectorizer on the test set would make no sense, since the model was trained on a different vocabulary.)
You can read more about how to work with sparse features here.
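A minimal sketch of that workflow (the corpus, labels, and file name below are hypothetical): fit the vectorizer once on the training documents, reuse the same object to transform any new document, and persist it with joblib if prediction happens in a later session.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
import joblib

# hypothetical training corpus and labels
train_docs = ["the laptop is good", "the radio sound is bad"]
train_labels = ["good", "bad"]

# fit the vectorizer ONCE, on the training corpus only
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_docs)
clf = BernoulliNB().fit(X_train, train_labels)

# a new document is transformed with the SAME fitted vectorizer: words
# unseen during training are simply ignored, so the feature space stays
# identical to the one the classifier was trained on
X_new = vectorizer.transform(["this new laptop is good"])
print(clf.predict(X_new))

# to reuse the vectorizer in a later session, persist it with the model
joblib.dump((vectorizer, clf), "model.joblib")
vectorizer, clf = joblib.load("model.joblib")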
Answer 1 (score: 0)
Here I classify text using data with two columns, text and review. The text column contains sentences/phrases; the review column can be good, bad, or neutral.
TF-IDF feature vectors are used to create the features.
Naive Bayes, logistic regression, random forest, a neural network, and an LSTM are used to build the classifiers.
I show the basic steps of developing the various algorithms for classifying sentences; more parameter tuning is needed to improve accuracy.
The code was developed in Python in a Jupyter Notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, ensemble
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, Embedding, SpatialDropout1D,
                                     Conv1D, MaxPooling1D, LSTM)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
#Read data from a csv file. The file contains two columns: the first is the text column containing sentences, the second is the class (target)
#The problem is to build classifiers that learn the sentences and their corresponding class,
#then use the model to predict the class of new test sentence(s)
doc = pd.read_csv("C:\\data.csv")
print("The head of the file looks as below:")
doc.head()
The head of the file looks as below:
text review
0 the laptop is good but it hangs bad
1 this tv is very fast in changing channels good
2 the radio sound quality is same as the tv sound neutral
3 i dont know the quality of this new radio neutral
4 the laptop runs faster with 8 gb ram good
#split the dataset into training and validation
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(doc['text'], doc['review'], train_size=.6, stratify=doc['review'])
#the target column can be bad, good, or neutral
#label-encode the target variable: fit on the training labels,
#then reuse the same mapping for the validation labels
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)
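As a quick illustration of why the validation labels must go through transform rather than fit_transform (a toy example, not part of the original data):

#LabelEncoder learns a fixed alphabetical mapping on fit;
#transform reuses that mapping without changing it
enc = preprocessing.LabelEncoder().fit(["bad", "good", "neutral", "good"])
print(list(enc.classes_))              #['bad', 'good', 'neutral']
print(enc.transform(["good", "bad"]))  #[1 0]
#calling fit_transform again on the validation labels would silently
#build a new, possibly different mapping if a class is missing there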
# create word-level tf-idf feature vectors
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# fit on the training split only, so no validation vocabulary leaks into the features
tfidf_vect.fit(train_x)
xtrain_tfidf_word = tfidf_vect.transform(train_x)
xvalid_tfidf_word = tfidf_vect.transform(valid_x)
#train various ML models
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on the validation dataset
    predictions = classifier.predict(feature_vector_valid)
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    return metrics.accuracy_score(valid_y, predictions)
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("NB, WordLevel TF-IDF: ", accuracy)
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("LR, WordLevel TF-IDF: ", accuracy)
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("RF, WordLevel TF-IDF: ", accuracy)
# Extreme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_word.tocsc(), train_y, xvalid_tfidf_word.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)
#the targets are already integer-encoded above, so they only need to be
#converted to one-hot vectors with the keras to_categorical utility
dummy_target_train = to_categorical(train_y)
dummy_target_valid = to_categorical(valid_y)
#a plain bag-of-words view of the whole corpus, kept for inspection
cvec = CountVectorizer(stop_words='english')
cvec.fit(doc['text'])
dummyall_x = pd.DataFrame(cvec.transform(doc['text']).toarray(),
                          columns=cvec.get_feature_names_out())
#Accuracy of NB, LR, RF, and Xgb models:
#NB, WordLevel TF-IDF: 0.675
#LR, WordLevel TF-IDF: 0.675
#RF, WordLevel TF-IDF: 0.575
#Xgb, WordLevel TF-IDF: 0.65
#Basic neural network
vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'),
                             lowercase=True, min_df=1, max_df=0.9, max_features=5000)
X_train_onehot = vectorizer.fit_transform(train_x)
def baseline_model():
    model = Sequential()
    model.add(Dense(units=10, activation='relu',
                    input_dim=len(vectorizer.get_feature_names_out())))
    model.add(Dense(units=3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model = baseline_model()
#keras expects dense arrays, so the sparse matrices are converted before fitting;
#validate on the held-out split instead of a slice of the training data
X_valid_onehot = vectorizer.transform(valid_x)
model.fit(X_train_onehot.toarray(), dummy_target_train, epochs=2, batch_size=128,
          shuffle=True, verbose=1,
          validation_data=(X_valid_onehot.toarray(), dummy_target_valid))
scores = model.evaluate(X_valid_onehot.toarray(), dummy_target_valid, verbose=1)
print("Accuracy:", scores[1])
#Accuracy of neural network model:
#Accuracy: 0.25
#LSTM and CNN.
#Sequence data has a 1-d spatial structure; a CNN can pick out position-invariant features for the target,
#and the spatial features learned by the CNN are then read as a sequence by the LSTM.
#maximum number of words to keep in the vocabulary
MAX_NB_WORDS = 50000
#maximum number of words in each sentence
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(doc['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = tokenizer.texts_to_sequences(doc['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
Y = pd.get_dummies(doc['review']).values
print('Shape of label tensor:', Y.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
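For intuition, this is what the tokenizer and padding do to a single sentence (the integer ids shown are illustrative; they depend on the fitted word index):

#texts_to_sequences maps each word to its integer id,
#pad_sequences left-pads the result to a fixed length
demo = tokenizer.texts_to_sequences(["the laptop is good"])
print(demo)                            # e.g. [[1, 12, 7, 4]]
print(pad_sequences(demo, maxlen=10))  # e.g. [[0 0 0 0 0 0 1 12 7 4]]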
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
#CNN part: dropout over whole embedding channels, then convolution and pooling
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=100, kernel_size=10, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
#the LSTM reads the pooled CNN features as a sequence
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
epochs = 2
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();
#Accuracy of LSTM
#Accuracy: 0.400
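Finally, the same rule from the original question applies to the LSTM as well: a new sentence must go through the already-fitted tokenizer and the same padding before model.predict (a sketch; the sentence is hypothetical):

#predict the class of a new sentence with the trained CNN+LSTM model,
#reusing the fitted tokenizer and the same MAX_SEQUENCE_LENGTH
new_text = ["the new radio has a bad sound"]
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
probs = model.predict(padded)
#pd.get_dummies ordered the target columns alphabetically: bad, good, neutral
labels = sorted(doc['review'].unique())
print(labels[probs.argmax(axis=-1)[0]])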