我正在尝试在knn模型上运行k倍交叉验证。
这是我的完整代码。
首先,我导入数据。
df = read_csv("train.csv")
第二,我删除了缺失的值和离群值。
df = df.dropna()
outliers_feat = ['Feature_0', 'Feature_1', 'Evaporation', 'Feature_4', 'Feature_10',
'WindGustSpeed', 'Feature_21', 'Feature_23', 'Feature_24']
Q1 = df[outliers_feat].quantile(0.25)
Q3 = df[outliers_feat].quantile(0.75)
IQR = Q3 - Q1
print(IQR)
df_filtered = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis = 1)]
然后,我将数据拆分为训练集和验证集,然后对其进行缩放并使用PCA方法减小维数。
#first we decalre feature vector and target feature
X = df_filtered.drop(['label'], axis = 1)
y = df_filtered['label']
#next we split the data into seperate training and test set
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.2, random_state = 42)
cols = X_train.columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_train_scaled = pd.DataFrame(X_train_scaled, columns = [cols])
X_validation_scaled = pd.DataFrame(X_validation_scaled, columns = [cols])
pca = PCA(n_components = 16)
pca.fit(X_train_scaled)
columns = ['pca_%i' % i for i in range(16)]
X_train_pca = pd.DataFrame(pca.transform(X_train_scaled), columns = columns, index = X_train_scaled.index)
pca = PCA(n_components = 16)
pca.fit(X_validation_scaled)
columns = ['pca_%i' % i for i in range(16)]
X_validation_pca = pd.DataFrame(pca.transform(X_validation_scaled), columns = columns, index = X_validation.index)
然后我适合knn分类器并运行k折CV:
knn_clf = KNeighborsClassifier(n_neighbors = 15, weights = "uniform", algorithm = 'auto',
leaf_size = 30, p = 2, metric = "minkowski",
metric_params = None, n_jobs = None)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc
cv = StratifiedKFold(n_splits = 10)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10,10))
i = 0
for train, test in cv.split(X_train_pca, y_train):
probas_ = knn_clf.fit(X_train_pca[train], y_train[train]).predict_proba(X_train[test])
fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
tprs.append(np.interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
plt.plot(fpr, tpr, lw=1, alpha=0.3,
label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
i += 1
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate',fontsize=18)
plt.ylabel('True Positive Rate',fontsize=18)
plt.title('Cross-Validation ROC for K-NN model',fontsize=18)
plt.legend(loc="lower right", prop={'size': 15})
plt.show()
我收到了上面提到的错误。