在没有cross_val_score的情况下执行交叉验证

时间:2018-04-11 19:25:52

标签: scikit-learn nested cross-validation

为了完全访问内部和外部分数,我想在不使用cross_val_score的情况下创建嵌套的cros验证和网格搜索。

我已经按照我在网上找到的例子https://github.com/rasbt/pattern_classification/blob/master/data_viz/model-evaluation-articles/nested_cv_code.ipynb

我怀疑内巢是好的。我不确定在调用GridSearchCV之前是否必须拆分数据:

    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner = X_train_outer[train_index_inner]
            y_train_inner = y_train_outer[train_index_inner]
            X_test_inner  = X_train_outer[test_index_inner]
            y_test_inner  = y_train_outer[test_index_inner] 
# inner cross-validation
            for name, gs_est in sorted(gridcvs.items()):
                #print(gs_est)
                gs_est.fit(X_train_inner, y_train_inner)
                y_pred = gs_est.predict(X_test_inner)
                #print(y_test_inner.shape)
                inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
                cv_scores[name].append(inner_score)
                #for mean_score, params in zip(gs_est.cv_results_ ['mean_test_score'], 
                                              #gs_est.cv_results_ ['params']):
                                             #print(name, params, mean_score)
    print('print cvscores for model:', cv_scores)                   
    outer_counter = outer_counter + 1   

整个代码:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import operator


perf_list = []     # list with the performance
hp_list = []      # hyperparameter list
algo_familiy = [] # algorithm familiy list


##################################################################################################
randomState=1
average_scores_across_outer_folds_for_each_model = dict()
X, y = make_regression(n_samples=1000, n_features=10)
##################################################################################################
# Create X_test,   y_test = TEST SET
# Create X_train, y_train = TRAIN & VALIDATION SET
X_train, X_gtest, y_train, y_gtest= train_test_split(X, y, train_size=0.8, random_state=randomState)
print(X_train.shape)
#print(X_train.shape)
#print(X_test.shape)
#print(y_train.shape)
#print(y_test.shape)
##################################################################################################
##################################################################
# Regressors you want to use 
reg1 = KNeighborsRegressor()
reg2 = RandomForestRegressor()

# Building the pipelines (Transformer, Classifier)
pipe1 = Pipeline([('std' , StandardScaler()),
                  ('reg1', reg1)])

pipe2 = Pipeline([('std' , StandardScaler()),
                  ('reg2', reg2)])

# Setting up parameters for grid
param_grid1 = [{'reg1__n_neighbors': list(range(7, 10))}]

param_grid2 = [{'reg2__max_depth': [50, 20]}]

# outer cross-validation
outer_counter = 1
outer_cv = KFold(n_splits=3, shuffle=True)
inner_cv = KFold(n_splits=2, shuffle=True, random_state=randomState)
################################################################## 
###########################
gridcvs = {}
for pgrid, est, name in zip((param_grid1, param_grid2),
                            (pipe1, pipe2),
                            ('KNN', 'RF')):
    regressor_that_optimizes_its_hyperparams = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='r2',
                       n_jobs=1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = regressor_that_optimizes_its_hyperparams
################################################################## 
################################################################## 
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train): 
    print('outer_cv', outer_counter)
    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    X_test_outer  = X_train[test_index_outer]
    y_test_outer  = y_train[test_index_outer]
#    print(X_train_outer.shape)
#    print(X_test_outer.shape)
    cv_scores = {name: [] for name, gs_est in gridcvs.items()}
    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner = X_train_outer[train_index_inner]
            y_train_inner = y_train_outer[train_index_inner]
            X_test_inner  = X_train_outer[test_index_inner]
            y_test_inner  = y_train_outer[test_index_inner] 
# inner cross-validation
            for name, gs_est in sorted(gridcvs.items()):
                #print(gs_est)
                gs_est.fit(X_train_inner, y_train_inner)
                y_pred = gs_est.predict(X_test_inner)
                #print(y_test_inner.shape)
                inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
                cv_scores[name].append(inner_score)
                #for mean_score, params in zip(gs_est.cv_results_ ['mean_test_score'], 
                                              #gs_est.cv_results_ ['params']):
                                             #print(name, params, mean_score)
    print('print cvscores for model:', cv_scores)                   
    outer_counter = outer_counter + 1
# Looking at the results        
#####################################################################        
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))

many_stars = '\n' + '*' * 100 + '\n'
print(many_stars + 'Now we choose the best model and refit on the whole dataset' + many_stars) 

# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['RF']

best_algo.fit(X_train, y_train)
train_acc = r2_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = r2_score(y_true=y_gtest, y_pred=best_algo.predict(X_gtest))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RF'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
# Fitting a model to the whole dataset
# using the "best" algorithm and hyperparameter settings
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)

1 个答案:

答案 0 :(得分:1)

通常,您可以使用您发布的代码获得嵌套的交叉验证。

for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train): 
    print('outer_cv', outer_counter)
    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    X_test_outer  = X_train[test_index_outer]
    y_test_outer  = y_train[test_index_outer]
    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
            X_train_inner = X_train_outer[train_index_inner]
            y_train_inner = y_train_outer[train_index_inner]
            X_test_inner  = X_train_outer[test_index_inner]
            y_test_inner  = y_train_outer[test_index_inner] 
            # fit something on X_train_inner 
            # evaluate it on X_test_inner  

或者您可以执行以下操作: 如果您传递GridSearchCV参数cv inner_cv,则GridSearchCV会在您调用.fit()方法时自动执行拆分。当拟合完成后,您可以探索.cv_results以获得每个自动生成的内部折叠上的单个模型分数。

   for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train): 
        X_train_outer = X_train[train_index_outer]
        y_train_outer = y_train[train_index_outer]
        X_test_outer  = X_train[test_index_outer]
        y_test_outer  = y_train[test_index_outer]

        cv= GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='r2',
                       n_jobs=1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
        cv.fit(X_train_outer,y_train_outer)