To have full access to the inner and outer scores, I want to build a nested cross-validation with grid search without using cross_val_score.
I followed an example I found online: https://github.com/rasbt/pattern_classification/blob/master/data_viz/model-evaluation-articles/nested_cv_code.ipynb
I am not sure the inner loop is right; in particular, I am not sure whether I have to split the data myself before calling GridSearchCV:
for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
    X_train_inner = X_train_outer[train_index_inner]
    y_train_inner = y_train_outer[train_index_inner]
    X_test_inner = X_train_outer[test_index_inner]
    y_test_inner = y_train_outer[test_index_inner]

    # inner cross-validation
    for name, gs_est in sorted(gridcvs.items()):
        gs_est.fit(X_train_inner, y_train_inner)
        y_pred = gs_est.predict(X_test_inner)
        inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
        cv_scores[name].append(inner_score)

print('print cvscores for model:', cv_scores)
outer_counter = outer_counter + 1
The whole code:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import operator
perf_list = []    # list with the performance
hp_list = []      # hyperparameter list
algo_family = []  # algorithm family list
##################################################################################################
randomState=1
average_scores_across_outer_folds_for_each_model = dict()
X, y = make_regression(n_samples=1000, n_features=10)
##################################################################################################
# Create X_test, y_test = TEST SET
# Create X_train, y_train = TRAIN & VALIDATION SET
X_train, X_gtest, y_train, y_gtest = train_test_split(X, y, train_size=0.8, random_state=randomState)
print(X_train.shape)
##################################################################################################
##################################################################
# Regressors you want to use
reg1 = KNeighborsRegressor()
reg2 = RandomForestRegressor()
# Building the pipelines (Transformer, Classifier)
pipe1 = Pipeline([('std', StandardScaler()),
                  ('reg1', reg1)])
pipe2 = Pipeline([('std', StandardScaler()),
                  ('reg2', reg2)])
# Setting up parameters for grid
param_grid1 = [{'reg1__n_neighbors': list(range(7, 10))}]
param_grid2 = [{'reg2__max_depth': [50, 20]}]
# outer cross-validation
outer_counter = 1
outer_cv = KFold(n_splits=3, shuffle=True)
inner_cv = KFold(n_splits=2, shuffle=True, random_state=randomState)
##################################################################
###########################
gridcvs = {}
for pgrid, est, name in zip((param_grid1, param_grid2),
                            (pipe1, pipe2),
                            ('KNN', 'RF')):
    regressor_that_optimizes_its_hyperparams = GridSearchCV(estimator=est,
                                                            param_grid=pgrid,
                                                            scoring='r2',
                                                            n_jobs=1,
                                                            cv=inner_cv,
                                                            verbose=0,
                                                            refit=True)
    gridcvs[name] = regressor_that_optimizes_its_hyperparams
##################################################################
##################################################################
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
    print('outer_cv', outer_counter)

    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    X_test_outer = X_train[test_index_outer]
    y_test_outer = y_train[test_index_outer]

    cv_scores = {name: [] for name, gs_est in gridcvs.items()}

    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner = X_train_outer[train_index_inner]
        y_train_inner = y_train_outer[train_index_inner]
        X_test_inner = X_train_outer[test_index_inner]
        y_test_inner = y_train_outer[test_index_inner]

        # inner cross-validation
        for name, gs_est in sorted(gridcvs.items()):
            gs_est.fit(X_train_inner, y_train_inner)
            y_pred = gs_est.predict(X_test_inner)
            inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
            cv_scores[name].append(inner_score)

    print('print cvscores for model:', cv_scores)
    outer_counter = outer_counter + 1
# Looking at the results
#####################################################################
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
many_stars = '\n' + '*' * 100 + '\n'
print(many_stars + 'Now we choose the best model and refit on the whole dataset' + many_stars)
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['RF']
best_algo.fit(X_train, y_train)
train_acc = r2_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = r2_score(y_true=y_gtest, y_pred=best_algo.predict(X_gtest))
print('Accuracy %.2f%% (average over CV test folds)' %
(100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RF'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
# Fitting a model to the whole dataset
# using the "best" algorithm and hyperparameter settings
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)
Answer (score: 1)
In general, you can get a nested cross-validation with the code you posted.
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
    print('outer_cv', outer_counter)

    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    X_test_outer = X_train[test_index_outer]
    y_test_outer = y_train[test_index_outer]

    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner = X_train_outer[train_index_inner]
        y_train_inner = y_train_outer[train_index_inner]
        X_test_inner = X_train_outer[test_index_inner]
        y_test_inner = y_train_outer[test_index_inner]

        # fit something on X_train_inner
        # evaluate it on X_test_inner
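For concreteness, here is a minimal sketch of what those two placeholder comments could contain, using pipe1, r2_score and the reg1__n_neighbors parameter from the question's code (the value 8 is only an illustration); a fully manual grid search would loop over all candidate hyperparameter values here:

        # Sketch only: fit one candidate configuration on the inner training fold
        # and score it on the inner validation fold; a manual grid search would
        # repeat this for every candidate value and average the inner scores
        # before picking a winner for this outer fold.
        candidate = pipe1.set_params(reg1__n_neighbors=8)    # 8 is an arbitrary example value
        candidate.fit(X_train_inner, y_train_inner)          # fit something on X_train_inner
        inner_r2 = r2_score(y_test_inner, candidate.predict(X_test_inner))  # evaluate it on X_test_inner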
Or you can do the following: if you pass GridSearchCV the parameter cv=inner_cv, then GridSearchCV performs the split automatically when you call its .fit() method. Once the fit is finished, you can explore .cv_results_ to get the individual model scores on each of the automatically generated inner folds.
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    X_test_outer = X_train[test_index_outer]
    y_test_outer = y_train[test_index_outer]

    # est and pgrid as defined in the question (one estimator / grid pair)
    cv = GridSearchCV(estimator=est,
                      param_grid=pgrid,
                      scoring='r2',
                      n_jobs=1,
                      cv=inner_cv,
                      verbose=0,
                      refit=True)
    cv.fit(X_train_outer, y_train_outer)
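After that fit, the per-inner-fold scores mentioned above live in cv.cv_results_. Continuing inside the outer loop, a sketch of how to read them (the split0_/split1_ keys follow scikit-learn's cv_results_ naming and correspond to the two folds of inner_cv; scoring the refitted best model on the outer test fold is one way to close the outer loop):

    # Sketch: inspect the score of every candidate on each inner fold
    results = cv.cv_results_
    for i, params in enumerate(results['params']):
        print(params,
              results['split0_test_score'][i],  # score on the 1st inner fold
              results['split1_test_score'][i],  # score on the 2nd inner fold
              results['mean_test_score'][i])    # mean over the inner folds

    # With refit=True, the best model is refitted on X_train_outer and can
    # be scored on the held-out outer fold
    outer_r2 = cv.score(X_test_outer, y_test_outer)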