我创建了一个用Python编写的支持向量机(SVM)。由于这是一种多标签分类的一对多(one-vs-rest)方法,因此我为每个标签创建了一个SVM模型。现在,当我测试SVM模型时,我不明白为什么某些标签的F1分数偏低。原因之一可能是我的训练数据不如我想象的那么准确。但仅凭这一点足以解释问题吗?请帮助我思考为什么会发生这种情况。我刚刚开始学习机器学习,对此仍然很陌生。请帮助我完成我的论文。谢谢。
已附上每个标签的平均F1分数(Average F1 scores per label)的图片。
这是我的代码的一部分:
# Split the data 80/20 into train and test sets (fixed seed for reproducibility).
# (The old comment "Connect to the database" was wrong — no DB access happens here.)
train, test = train_test_split(df, random_state=42, train_size=0.80, test_size=0.20, shuffle=True)
train_text = train['question_body']
test_text = test['question_body']

# TF-IDF features over word 1- to 3-grams; min_df=15 drops terms seen in
# fewer than 15 documents, which trims the vocabulary of rare noise.
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stop_words, analyzer='word', ngram_range=(1,3), norm='l2', min_df=15)
# Fit vocabulary and IDF weights on the TRAINING text only, so no information
# from the test split leaks into the features.
vectorizer.fit(train_text)

# Persist the fitted vectorizer. Use a context manager so the file handle is
# actually closed — the previous pickle.dump(..., open(...)) leaked it.
with open('./data-models/vectorizer.sav', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

x_train = vectorizer.transform(train_text)
# Labels are every column except the raw question text.
y_train = train.drop(labels=['question_body'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['question_body'], axis=1)
def _ovr_pipeline(base_estimator):
    """Wrap *base_estimator* in a one-vs-rest classifier inside a one-step pipeline."""
    return Pipeline([
        ('clf', OneVsRestClassifier(base_estimator, n_jobs=-1)),
    ])

# One pipeline per model family: linear SVM, logistic regression, multinomial NB.
SVC_pipeline = _ovr_pipeline(LinearSVC())
LogReg_pipeline = _ovr_pipeline(LogisticRegression(solver='sag'))
NB_pipeline = _ovr_pipeline(MultinomialNB())

# Hyperparameter search spaces. The double-underscore keys address the wrapped
# estimator inside the pipeline step ('clf' -> OneVsRestClassifier -> estimator).
param_grid = {
    'clf__estimator__C': np.arange(1,5),
    'clf__estimator__tol': [1, 0.01, 0.001, 0.0001, 0.00000001],
}
nb_param_grid = {
    'clf__estimator__alpha': [1, 1e-1, 1e-2],
}
# For each label, tune each model family with 10-fold CV, then refit with the
# best hyperparameters and save one model per (family, label) pair.
# Filename convention: <model name>-<category name>.sav
for category in categories:
    print('... Processing {}'.format(category))

    # --- Hyperparameter search (10-fold CV, macro-averaged F1) ---
    svc_clf_cv = GridSearchCV(SVC_pipeline, param_grid, cv=10, scoring='f1_macro')
    svc_clf_cv.fit(x_train, train[category])
    print("SVC:")
    print("Tuned Parameters: {}".format(svc_clf_cv.best_params_))

    lr_clf_cv = GridSearchCV(LogReg_pipeline, param_grid, cv=10, scoring='f1_macro')
    lr_clf_cv.fit(x_train, train[category])
    print("Log Reg:")
    print("Tuned Parameters: {}".format(lr_clf_cv.best_params_))

    nb_clf_cv = GridSearchCV(NB_pipeline, nb_param_grid, cv=10, scoring='f1_macro')
    nb_clf_cv.fit(x_train, train[category])
    print("NB:")
    print("Tuned Parameters: {}".format(nb_clf_cv.best_params_))

    # --- Refit with tuned parameters and persist ---
    # NOTE(review): GridSearchCV(refit=True, the default) already refits
    # best_estimator_ on the full training data, so these manual rebuilds are
    # redundant — pickling svc_clf_cv.best_estimator_ etc. would be equivalent.
    # Kept here for parity with the original flow.
    SVC2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            LinearSVC(C=svc_clf_cv.best_params_.get('clf__estimator__C'),
                      tol=svc_clf_cv.best_params_.get('clf__estimator__tol')),
            n_jobs=-1)),
    ])
    SVC2_pipeline.fit(x_train, train[category])
    filename = 'svc-' + category + '.sav'
    # Context manager closes the file — the old pickle.dump(..., open(...)) leaked it.
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(SVC2_pipeline, model_file)

    LogReg2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            LogisticRegression(solver='sag',
                               C=lr_clf_cv.best_params_.get('clf__estimator__C'),
                               tol=lr_clf_cv.best_params_.get('clf__estimator__tol')),
            n_jobs=-1)),
    ])
    LogReg2_pipeline.fit(x_train, train[category])
    filename = 'lr-' + category + '.sav'
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(LogReg2_pipeline, model_file)

    # Multinomial Naive Bayes (the old comment wrongly said "logistic regression").
    NB2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            MultinomialNB(alpha=nb_clf_cv.best_params_.get('clf__estimator__alpha')),
            n_jobs=-1)),
    ])
    NB2_pipeline.fit(x_train, train[category])
    filename = 'nb-' + category + '.sav'
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(NB2_pipeline, model_file)