我创建了一个用Python编写的支持向量机(SVM)。由于这是一种多标签分类的一对多(one-vs-rest)方法,因此我为每个标签创建了一个SVM模型。现在,当我测试SVM模型时,我不明白为什么某些标签的F1分数偏低。原因之一可能是我的训练数据不如我想象的那么准确。但仅凭这一点足以解释问题吗?请帮助我思考为什么会发生这种情况。我刚刚开始学习机器学习,对此仍然很陌生。请帮助我完成我的论文。谢谢。
已附上每个标签的平均F1分数(Average F1 scores per label)的图片。
这是我的代码的一部分:
# Split the data 80/20 into train and test sets (fixed seed for reproducibility).
# (The old comment "Connect to the database" was wrong — no DB access happens here.)
train, test = train_test_split(df, random_state=42, train_size=0.80, test_size=0.20, shuffle=True)
train_text = train['question_body']
test_text = test['question_body']

# TF-IDF features over word 1- to 3-grams; min_df=15 drops terms seen in
# fewer than 15 documents, which trims the vocabulary of rare noise.
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stop_words, analyzer='word', ngram_range=(1,3), norm='l2', min_df=15)
# Fit vocabulary and IDF weights on the TRAINING text only, so no information
# from the test split leaks into the features.
vectorizer.fit(train_text)

# Persist the fitted vectorizer. Use a context manager so the file handle is
# actually closed — the previous pickle.dump(..., open(...)) leaked it.
with open('./data-models/vectorizer.sav', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

x_train = vectorizer.transform(train_text)
# Labels are every column except the raw question text.
y_train = train.drop(labels=['question_body'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['question_body'], axis=1)
def _ovr_pipeline(base_estimator):
    """Wrap *base_estimator* in a one-vs-rest classifier inside a one-step pipeline."""
    return Pipeline([
        ('clf', OneVsRestClassifier(base_estimator, n_jobs=-1)),
    ])

# One pipeline per model family: linear SVM, logistic regression, multinomial NB.
SVC_pipeline = _ovr_pipeline(LinearSVC())
LogReg_pipeline = _ovr_pipeline(LogisticRegression(solver='sag'))
NB_pipeline = _ovr_pipeline(MultinomialNB())

# Hyperparameter search spaces. The double-underscore keys address the wrapped
# estimator inside the pipeline step ('clf' -> OneVsRestClassifier -> estimator).
param_grid = {
    'clf__estimator__C': np.arange(1,5),
    'clf__estimator__tol': [1, 0.01, 0.001, 0.0001, 0.00000001],
}
nb_param_grid = {
    'clf__estimator__alpha': [1, 1e-1, 1e-2],
}
# For each label, tune each model family with 10-fold CV, then refit with the
# best hyperparameters and save one model per (family, label) pair.
# Filename convention: <model name>-<category name>.sav
for category in categories:
    print('... Processing {}'.format(category))

    # --- Hyperparameter search (10-fold CV, macro-averaged F1) ---
    svc_clf_cv = GridSearchCV(SVC_pipeline, param_grid, cv=10, scoring='f1_macro')
    svc_clf_cv.fit(x_train, train[category])
    print("SVC:")
    print("Tuned Parameters: {}".format(svc_clf_cv.best_params_))

    lr_clf_cv = GridSearchCV(LogReg_pipeline, param_grid, cv=10, scoring='f1_macro')
    lr_clf_cv.fit(x_train, train[category])
    print("Log Reg:")
    print("Tuned Parameters: {}".format(lr_clf_cv.best_params_))

    nb_clf_cv = GridSearchCV(NB_pipeline, nb_param_grid, cv=10, scoring='f1_macro')
    nb_clf_cv.fit(x_train, train[category])
    print("NB:")
    print("Tuned Parameters: {}".format(nb_clf_cv.best_params_))

    # --- Refit with tuned parameters and persist ---
    # NOTE(review): GridSearchCV(refit=True, the default) already refits
    # best_estimator_ on the full training data, so these manual rebuilds are
    # redundant — pickling svc_clf_cv.best_estimator_ etc. would be equivalent.
    # Kept here for parity with the original flow.
    SVC2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            LinearSVC(C=svc_clf_cv.best_params_.get('clf__estimator__C'),
                      tol=svc_clf_cv.best_params_.get('clf__estimator__tol')),
            n_jobs=-1)),
    ])
    SVC2_pipeline.fit(x_train, train[category])
    filename = 'svc-' + category + '.sav'
    # Context manager closes the file — the old pickle.dump(..., open(...)) leaked it.
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(SVC2_pipeline, model_file)

    LogReg2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            LogisticRegression(solver='sag',
                               C=lr_clf_cv.best_params_.get('clf__estimator__C'),
                               tol=lr_clf_cv.best_params_.get('clf__estimator__tol')),
            n_jobs=-1)),
    ])
    LogReg2_pipeline.fit(x_train, train[category])
    filename = 'lr-' + category + '.sav'
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(LogReg2_pipeline, model_file)

    # Multinomial Naive Bayes (the old comment wrongly said "logistic regression").
    NB2_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            MultinomialNB(alpha=nb_clf_cv.best_params_.get('clf__estimator__alpha')),
            n_jobs=-1)),
    ])
    NB2_pipeline.fit(x_train, train[category])
    filename = 'nb-' + category + '.sav'
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(NB2_pipeline, model_file)