我正在尝试对逻辑回归模型做交叉验证,该模型通过 sklearn 管道(Pipeline)以 tfidf 向量作为输入。我找到了几个采用类似做法的示例,但我的代码无法运行,报错为“ValueError: Found input variables with inconsistent numbers of samples: [1, 200]”(输入变量的样本数量不一致)。如果把逻辑回归模型从管道中移除,tfidf 向量化器本身可以正常工作。以下是我的代码:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Two toy documents (comma-separated tokens) labelled '1' and '0'.
rows = [['1', 'buy, help, useful'], ['0', 'buy,bad, useful']]
data = pd.DataFrame(rows, columns=['polarity', 'tokens'])
# Repeat the two rows 150 times (300 samples) and order them by their
# original index, so the index labels are no longer unique.
data = pd.concat([data] * 150).sort_index()
# The identity preprocessor replaces the vectorizer's own preprocessing
# stage; tokenization still happens afterwards.
tvec = TfidfVectorizer(preprocessor=lambda x: x, max_features=10000)
lr = LogisticRegression()
# TfidfVectorizer expects an iterable of documents (strings). Iterating a
# DataFrame yields its column labels, so `data.drop('polarity', axis=1)`
# looked like ONE document ('tokens') and fitting failed with
# "inconsistent numbers of samples: [1, 200]". Select the text column as a
# 1-D Series so every row is seen as one document.
X = data['tokens']
y = data.polarity
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
def lgr_cv(splits, X, y, pipeline, average_method):
    """Cross-validate ``pipeline`` with stratified k-fold and print metrics.

    For every fold the pipeline is refit on the training part and evaluated
    on the held-out part. Per-class precision, recall and F1 are printed for
    each fold; the mean and standard deviation of accuracy, precision,
    recall and F1 across folds are printed at the end.

    Args:
        splits: Number of folds passed to ``StratifiedKFold``.
        X: Samples; sliced positionally with ``.iloc``, so a pandas object
            is expected.
        y: Labels aligned with ``X``; also sliced with ``.iloc``.
        pipeline: Estimator exposing ``fit``, ``predict`` and ``score``.
            It is refit in place on every fold.
        average_method: Value forwarded as ``average`` to the metric
            functions for the aggregated scores.

    Returns:
        None. All results are written to standard output.
    """
    # Local import: the summary below needs NumPy, and the module-level
    # imports visible in this file do not provide ``np`` (the original
    # would raise NameError here).
    import numpy as np

    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lr_fit = pipeline.fit(X_train, y_train)
        prediction = lr_fit.predict(X_test)
        # Whatever ``score`` returns is reported as accuracy (in percent).
        accuracy.append(lr_fit.score(X_test, y_test) * 100)

        precision.append(
            precision_score(y_test, prediction, average=average_method) * 100
        )
        # NOTE(review): this header names three classes; confirm it matches
        # the labels actually present in ``y``.
        print(' negative neutral positive')
        print('precision:', precision_score(y_test, prediction, average=None))
        recall.append(
            recall_score(y_test, prediction, average=average_method) * 100
        )
        print('recall: ', recall_score(y_test, prediction, average=None))
        f1.append(f1_score(y_test, prediction, average=average_method) * 100)
        print('f1 score: ', f1_score(y_test, prediction, average=None))

    # Summary over all folds: mean and standard deviation, in percent.
    for label, values in (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1 score", f1),
    ):
        print("%s: %.2f%% (+/- %.2f%%)" % (label, np.mean(values), np.std(values)))
from sklearn.pipeline import Pipeline
# Chain the TF-IDF vectorizer and the classifier so that both are refit
# together whenever the pipeline is fitted.
original_pipeline = Pipeline(
    steps=[('vectorizer', tvec), ('classifier', lr)],
)
# 3-fold stratified cross-validation with macro-averaged summary metrics.
lgr_cv(
    splits=3,
    X=X,
    y=y,
    pipeline=original_pipeline,
    average_method='macro',
)
如能得到任何帮助,不胜感激。