使用 sklearn 管道对 tf-idf 向量进行 K 折逻辑回归

时间:2021-06-30 20:13:47

标签: python pipeline logistic-regression tf-idf k-fold

我正在尝试将交叉验证应用于逻辑回归,该逻辑回归使用 sklearn 管道作为输入 tfidf 向量。我发现了几个以这种方式接近的示例,但我的代码不起作用。我收到错误“ValueError:发现样本数量不一致的输入变量:[1, 200]”。 如果我从管道中删除回归模型,tfidf 向量化器就可以正常工作。 这是我的代码:

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

rows = [['1','buy, help, useful'], ['0','buy,bad, useful']]
data = pd.DataFrame(rows, columns = ['polarity', 'tokens'])

data = pd.concat([data]*150).sort_index()

tvec = TfidfVectorizer(preprocessor=lambda x: x,max_features=10000)
lr = LogisticRegression()

X = data.drop('polarity',axis = 1)
y = data.polarity

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

def lgr_cv(splits, X, y, pipeline, average_method):

kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
accuracy = []
precision = []
recall = []
f1 = []
for train_index, test_index in kfold.split(X,y): 
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    lr_fit = pipeline.fit(X_train,y_train)
    prediction = lr_fit.predict(X_test)
    scores = lr_fit.score(X_test,y_test)
    
    accuracy.append(scores * 100)
    precision.append(precision_score(y_test, prediction, average=average_method)*100)
    print('              negative    neutral     positive')
    print('precision:',precision_score(y_test, prediction, average=None))
    recall.append(recall_score(y_test, prediction, average=average_method)*100)
    print('recall:   ',recall_score(y_test, prediction, average=None))
    f1.append(f1_score(y_test, prediction, average=average_method)*100)
    print('f1 score: ',f1_score(y_test, prediction, average=None))
    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))



from sklearn.pipeline import Pipeline

original_pipeline = Pipeline([
    ('vectorizer', tvec),
    ('classifier', lr)
])
    
lgr_cv(3, X, y, original_pipeline, 'macro')

感谢任何帮助。

0 个答案:

没有答案