Question

我使用下面的代码先用 SMOTE 对原始数据进行过采样，然后通过交叉验证训练随机森林模型。

# Split the frame into the label column and the feature matrix.
y = df.target
X = df.drop('target', axis=1)

# SMOTE is placed in front of the classifier so both are fitted as one
# estimator (presumably imblearn's make_pipeline, which only resamples
# during fit -- confirm against the imports).
imba_pipeline = make_pipeline(SMOTE(random_state=27, sampling_strategy=1.0),
                              RandomForestClassifier(n_estimators=200, random_state=42))

# Named *_scores rather than f1_score / roc_auc_score: those names belong to
# functions in sklearn.metrics, and rebinding them to score arrays would break
# any later call such as roc_auc_score(y_true, y_prob).
f1_scores = cross_val_score(imba_pipeline, X, y, scoring='f1_weighted', cv=5)
roc_auc_scores = cross_val_score(imba_pipeline, X, y, scoring='roc_auc', cv=5)

# Report the mean of the five per-fold scores.
print("F1: %0.4f " % (f1_scores.mean()))
print("ROC-AUC: %0.4f " % (roc_auc_scores.mean()))

The output is : 
F1: 0.9336 
ROC-AUC: 0.6589

现在，我的问题是在这种情况下如何绘制 ROC 曲线？

在我们将数据拆分为训练和测试的正常情况下，我使用以下代码：

# Split the frame into the label column and the feature matrix.
y = df.target
X = df.drop('target', axis=1)

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=27)

# Oversample the training part only; the test part keeps its original class
# balance. fit_resample is the current imbalanced-learn method name
# (fit_sample was deprecated and later removed).
sm = SMOTE(random_state=27, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)

smote_rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score.
smote_pred_rf = smote_rf.predict_proba(X_test)[:, 1]

false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, smote_pred_rf)
# The model is a random forest, so label the printed score accordingly.
print('roc_auc_score for RandomForest: ', roc_auc_score(y_test, smote_pred_rf))

# plot ROC

plt.figure()

# Area under the curve that was just computed, shown in the legend.
auc_smote = auc(false_positive_rate1, true_positive_rate1)
plt.plot(false_positive_rate1, true_positive_rate1, color='red', lw=1,
         label='SMOTE (auc= %0.5f)' % auc_smote)

# Dashed diagonal: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], lw=1, color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Abalone Data Set (RF)', fontweight='bold')
plt.legend(loc="lower right")
plt.show()

Answer 1

首先，我认为您应该只运行一次交叉验证，而不是为每个想要的指标都重新运行一次。重复运行既浪费资源，而且这些指标衡量的也不是同一批拟合出来的模型。

为此，请参阅函数 cross_validate (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate)

示例：

# Interactive example: one cross_validate call evaluated with two metrics
# ('r2' and 'neg_mean_squared_error'), also returning the training scores.
# `lasso`, `X` and `y` are not defined in this excerpt.
>>> scores = cross_validate(lasso, X, y, cv=3,
...                         scoring=('r2', 'neg_mean_squared_error'),
...                         return_train_score=True)
>>> print(scores['test_neg_mean_squared_error'])
[-3635.5... -3573.3... -6114.7...]
>>> print(scores['train_r2'])
[0.28010158 0.39088426 0.22784852]

特别是对于 ROC 曲线，您可能需要更详细地了解并从每一轮交叉验证中获取预测。 sklearn 网站上的这个示例展示了一种方法：https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html

复制粘贴如下：

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.metrics import auc
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator (scikit-learn >= 1.0) is its replacement.
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold

# #############################################################################
# Data IO and generation

# Import some data to play with; dropping class 2 leaves a binary problem.
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

tprs = []  # per-fold TPR values resampled onto the common FPR grid
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    # Fit on this fold's training rows, then draw the ROC of its held-out rows.
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(classifier, X[test], y[test],
                                         name='ROC fold {}'.format(i),
                                         alpha=0.3, lw=1, ax=ax)
    # Each fold yields its own FPR points, so interpolate onto mean_fpr to
    # make the curves comparable point by point.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # force the curve to start at (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean curve across folds, pinned to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()

从交叉验证绘制 ROC 曲线

1 个答案: