I want to use a nested CV approach to find the best parameters for an SVC:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X, y = load_breast_cancer(return_X_y=True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
pipe_svc = make_pipeline(Imputer(), StandardScaler(), PCA(n_components=2), SVC(random_state=1))
param_range = [0.001,0.01,0.1,1,10,100,1000]
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
{'svc__C': param_range, 'svc__gamma': param_range,'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv=2)
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
scores
# how do I get the best parameters out from gridsearch after cross_val?
Out[]: array([0.925 , 0.9375 , 0.925 , 0.95 , 0.94871795])
gs.best_estimator_
Out[]: Pipeline(memory=None,
steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)...ar',
max_iter=-1, probability=False, random_state=1, shrinking=True,
tol=0.001, verbose=False))])
The last line of code only gives the 5 accuracy scores, and gs.best_estimator_ does not produce any useful information either. What is the best way to combine GridSearchCV with cross_val_score in a pipeline?
Answer 0 (score: 0)
Well, you don't have to use cross_val_score; you can get all the information and meta results during cross-validation and after finding the best estimator.
Consider the following example:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipe_svc = make_pipeline(Imputer(), StandardScaler(), PCA(n_components=2), SVC(random_state=1))
param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'svc__C': param_range, 'svc__kernel': ['linear', 'rbf'],
              'svc__gamma': param_range}
cv = StratifiedKFold(n_splits=5)
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=cv,
                  return_train_score=True)
gs.fit(X_train, y_train)
print("Best Estimator: \n{}\n".format(gs.best_estimator_))
print("Best Parameters: \n{}\n".format(gs.best_params_))
print("Best Test Score: \n{}\n".format(gs.best_score_))
print("Best Training Score: \n{}\n".format(gs.cv_results_['mean_train_score'][gs.best_index_]))
print("All Training Scores: \n{}\n".format(gs.cv_results_['mean_train_score']))
print("All Test Scores: \n{}\n".format(gs.cv_results_['mean_test_score']))
# This prints out all results collected during cross-validation in detail
# print("All Meta Results During CV Search: \n{}\n".format(gs.cv_results_))
Output
Best Estimator:
Pipeline(memory=None,
steps=[('imputer', Imputer(axis=0, copy=True,
missing_values='NaN', strategy='mean', verbose=0)),
('standardscaler', StandardScaler(copy=True, with_mean=True,
with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)...ar',
max_iter=-1, probability=False, random_state=1, shrinking=True,
tol=0.001, verbose=False))])
Best Parameters:
{'svc__gamma': 0.001, 'svc__kernel': 'linear', 'svc__C': 1}
Best Test Score:
0.9422110552763819
Best Training Score:
0.9440783896216558
All Training Scores:
[0.90012027 0.64070503 0.90012027 0.64070503 0.90012027 0.64070503
0.90012027 0.64070503 0.90012027 0.64070503 0.90012027 0.64070503
0.90012027 0.64070503 0.92587291 0.64070503 0.92587291 0.64070503
0.92587291 0.64070503 0.92587291 0.64070503 0.92587291 0.64070503
0.92587291 0.64070503 0.92587291 0.64070503 0.93779697 0.68906962
0.93779697 0.91582382 0.93779697 0.92901362 0.93779697 0.88879951
0.93779697 0.64070503 0.93779697 0.64070503 0.93779697 0.64070503
0.94407839 0.91394491 0.94407839 0.93277932 0.94407839 0.93968376
0.94407839 0.95413931 0.94407839 0.98052483 0.94407839 0.9949725
0.94407839 0.99937304 0.94533822 0.93090042 0.94533822 0.94345143
0.94533822 0.94911575 0.94533822 0.96293448 0.94533822 0.99434357
0.94533822 1. 0.94533822 1. 0.94533822 0.94219554
0.94533822 0.94219357 0.94533822 0.95099466 0.94533822 0.98052286
0.94533822 1. 0.94533822 1. 0.94533822 1.
0.94596518 0.9428225 0.94596518 0.94345537 0.94596518 0.95539323
0.94596518 0.99371858 0.94596518 1. 0.94596518 1.
0.94596518 1. ]
All Test Scores:
[0.88944724 0.64070352 0.88944724 0.64070352 0.88944724 0.64070352
0.88944724 0.64070352 0.88944724 0.64070352 0.88944724 0.64070352
0.88944724 0.64070352 0.92713568 0.64070352 0.92713568 0.64070352
0.92713568 0.64070352 0.92713568 0.64070352 0.92713568 0.64070352
0.92713568 0.64070352 0.92713568 0.64070352 0.9321608 0.68090452
0.9321608 0.90954774 0.9321608 0.92211055 0.9321608 0.84422111
0.9321608 0.64070352 0.9321608 0.64070352 0.9321608 0.64070352
0.94221106 0.9120603 0.94221106 0.92713568 0.94221106 0.91959799
0.94221106 0.93969849 0.94221106 0.81407035 0.94221106 0.65075377
0.94221106 0.64572864 0.94221106 0.92964824 0.94221106 0.92964824
0.94221106 0.92462312 0.94221106 0.92211055 0.94221106 0.80653266
0.94221106 0.65326633 0.94221106 0.64572864 0.94221106 0.92964824
0.94221106 0.93969849 0.94221106 0.92713568 0.94221106 0.90954774
0.94221106 0.82663317 0.94221106 0.65326633 0.94221106 0.64572864
0.93969849 0.94221106 0.93969849 0.93467337 0.93969849 0.92964824
0.93969849 0.87939698 0.93969849 0.8241206 0.93969849 0.65326633
0.93969849 0.64572864]
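If what you are after is truly nested cross-validation (an outer loop that estimates generalization performance while the inner GridSearchCV tunes the parameters) and you still want the per-fold best parameters, one option is cross_validate with return_estimator=True, which hands back the fitted GridSearchCV object for every outer fold. Below is a minimal sketch along those lines; it reuses gs, X_train, y_train, X_test and y_test from the code above and assumes scikit-learn >= 0.20 (where return_estimator was added), so adapt as needed:
from sklearn.model_selection import cross_validate
# Outer loop: estimates how well the whole "pipeline + grid search" procedure
# generalizes; the inner loop (gs) tunes C / gamma / kernel on each outer fold.
nested = cross_validate(gs, X_train, y_train, scoring='accuracy',
                        cv=5, return_estimator=True)
print("Outer-fold test scores: {}".format(nested['test_score']))
# Each returned estimator is a GridSearchCV fitted on one outer training fold,
# so the best parameters chosen in that fold are available directly.
for i, fitted_gs in enumerate(nested['estimator']):
    print("Fold {}: best params = {}".format(i, fitted_gs.best_params_))
# gs itself was already fitted on the full training set above, so a single
# final evaluation on the held-out test set is simply:
print("Held-out test accuracy: {}".format(gs.score(X_test, y_test)))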