Grid search is not giving the best parameters

Date: 2020-10-08 06:33:11

Tags: python machine-learning scikit-learn google-colaboratory grid-search

When I run a grid search over the inverse regularization strength parameter (for logistic regression and linear SVM) and over the number-of-neighbors parameter (for the K-nearest-neighbors classifier), the best parameters returned by the grid search are not actually the best ones I get when validating manually on the same training data set. The code is below:

# Convert to a DataFrame.
import pandas as pd
from sklearn.datasets import fetch_openml

df = fetch_openml('credit-g', as_frame=True).frame
df.head(5)

df.dtypes

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(12, 12))
st = fig.suptitle("univariate distributions and target distribution", fontsize=20)

# Using columns that we need for this plot
nfeatures = df[['duration', 'credit_amount' , 'age']]
target = df['class']

# creating 4x4 grid
grid = plt.GridSpec(4, 4, hspace=0.4, wspace=0.4)

# creating the normal plots in grid 1 , 2 ,3 and 4
p1 = fig.add_subplot(grid[:2,:2])
p2 = fig.add_subplot(grid[:2,2:])
p3 = fig.add_subplot(grid[2:,:2])
p4 = fig.add_subplot(grid[2:,2:])

p1.hist(nfeatures['duration'])
p2.hist(nfeatures['credit_amount'])
p3.hist(nfeatures['age'])
p4.hist(target)

p1.set_xlabel('duration')
p2.set_xlabel('credit_amount')
p3.set_xlabel('age')
p4.set_xlabel('class')
# customizing to look neat
st.set_y(0.95)
fig.subplots_adjust(top=0.92)


from sklearn.model_selection import train_test_split

columns = [column for column  in df.columns if column != 'class']
X = df[columns]
y = df['class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 ,random_state=11)
#X_train , y_train , X_valid , y_valid = train_test_split(X,) 
# basic preprocessing on train sets
# numeric_columns = ['duration','credit_amount' , 'installment_commitment' , 'residence_since' , 'age' ,'existing_credits' , 'num_dependents' ]
numeric_columns = df.select_dtypes(include=['float64']).columns
categorical_columns = [column for column in columns if column not in numeric_columns]
temp = X_train[categorical_columns]
X_train_ohe = pd.concat([pd.get_dummies(temp),X_train[numeric_columns]],axis=1)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(max_iter=1000)

cr = cross_val_score(lr,X_train_ohe,y_train)

print(cr)


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# define the data preparation for the categorical columns
t1 = [('cat', OneHotEncoder(), categorical_columns)]
col_transform = ColumnTransformer(transformers=t1)
# define the models
models = {'lr_model':LogisticRegression(max_iter=1000), 'lsvm_model':LinearSVC(max_iter=2500) , 'knn_model':KNeighborsClassifier()}

for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # evaluate the pipeline with cross-validation (default scorer: accuracy)
  score = cross_val_score(pipeline, X_train, y_train)
  print(name ,score.mean())

# define the data preparation for the categorical columns and numeric columns
t2 = [('cat', OneHotEncoder(), categorical_columns), ('num', StandardScaler(), numeric_columns)]
col_transform = ColumnTransformer(transformers=t2)
# try with new column transformer
for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # evaluate the pipeline with cross-validation (default scorer: accuracy)
  score = cross_val_score(pipeline, X_train, y_train)
  print(name ,score.mean())

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

f1_scorer = make_scorer(f1_score, pos_label="bad")

# 'prep__num__with_mean': [True, False],
# 'prep__num__with_std': [True, False],
param_grid = {
    'm__C': [0.1, 1.0 , 0.01],
    }

param_grid_knn = {
    'm__n_neighbors': [5, 10 , 15],
    }

for name,model in models.items():
  # define the data preparation and modeling pipeline
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # define the model cross-validation configuration
  #cv = KFold(n_splits=10, shuffle=True, random_state=1)
  # tune the pipeline with grid-search cross-validation using the F1 scorer
  if name == 'knn_model':
      grid_clf = GridSearchCV(pipeline, param_grid_knn, cv=5, scoring=f1_scorer )
  else:
      grid_clf = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer)
  grid_clf.fit(X_train, y_train)
  print(name,grid_clf.best_params_)
  print(name, grid_clf.best_estimator_.score(X_test, y_test))

lr_array = []
lr_c = [0.01,0.1,1]

for c in lr_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))


lsvm_array = []
lsvm_c = [0.01,0.1,1]

for c in lsvm_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LinearSVC(dual=True,max_iter=2500,C=c))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  lsvm_array.append(f1_score(y_train,y_hat,pos_label="bad"))


knn_array = []
knn_n = [5,10,15]

for n in knn_n:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
  pipeline.fit(X_train,y_train)
  y_hat = pipeline.predict(X_train)
  knn_array.append(f1_score(y_train,y_hat,pos_label="bad"))

fig = plt.figure(figsize=(12, 12))
# creating 3x1 grid
grid = plt.GridSpec(3, 1, hspace=0.4, wspace=0.4)

# creating the normal plots in grid 1 , 2 ,3
p1 = fig.add_subplot(grid[0,:])
p2 = fig.add_subplot(grid[1,:])
p3 = fig.add_subplot(grid[2,:])

p1.scatter(lr_c,lr_array)
p2.scatter(lsvm_c,lsvm_array)
p3.scatter(knn_n,knn_array)

When I use a different score, or evaluate on the test set instead of the training set, the trend changes, but the best parameters from the grid search and from the manual validation never seem to agree. What could be the reason for this? For example, if you run the code above, the grid search tells you that 10 is the best value for n_neighbors, but the final plot shows that 5 performs better. Is the comparison implemented incorrectly? You can see the output of a run at this link: https://github.com/binodmathews93/AppliedMachineLearningCourse/blob/master/Applied_Machine_Learning_Homework_2.ipynb
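
For completeness, here is a minimal sketch of how the cross-validated scores that the grid search actually compared could be inspected (this assumes the grid_clf object left over from the loop above, which after the last iteration is the KNN search):

# Inspect the per-candidate mean validation scores computed by GridSearchCV
# (cv_results_ holds one row per parameter setting that was tried).
cv_results = pd.DataFrame(grid_clf.cv_results_)
print(cv_results[['param_m__n_neighbors', 'mean_test_score', 'std_test_score']])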

1 answer:

Answer 0: (score: 1)

Hyperparameter tuning is performed on a validation (development) set, not on the training set.
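
A minimal sketch of that idea, reusing the pipeline pieces from your question (the split size and random_state here are arbitrary): carve a validation set out of the training data, fit on the remainder, and score each candidate only on the held-out part.

# Sketch: hold out an explicit validation set and score each candidate C on
# it, instead of on the data the model was fitted on.
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

for c in [0.01, 0.1, 1]:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
    pipeline.fit(X_tr, y_tr)                 # fit only on the reduced training split
    y_val_hat = pipeline.predict(X_val)      # predict on data the model has not seen
    print(c, f1_score(y_val, y_val_hat, pos_label="bad"))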

Grid-search cross-validation uses a K-fold strategy to build validation folds that are used only for validation, never for training.
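
Conceptually it works like the simplified sketch below (it reuses objects from your question; for classifiers GridSearchCV actually uses stratified folds by default, but the principle is the same): each fold is held out in turn and used only to score a model fitted on the remaining folds.

# What the grid search effectively does for a single candidate value:
from sklearn.base import clone
from sklearn.model_selection import KFold

candidate = Pipeline(steps=[('prep', col_transform), ('m', KNeighborsClassifier(n_neighbors=5))])
kf = KFold(n_splits=5, shuffle=True, random_state=1)

fold_scores = []
for train_idx, val_idx in kf.split(X_train):
    fold_model = clone(candidate)                           # fresh, unfitted copy
    fold_model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_val_hat = fold_model.predict(X_train.iloc[val_idx])   # held-out fold only
    fold_scores.append(f1_score(y_train.iloc[val_idx], y_val_hat, pos_label="bad"))

print(sum(fold_scores) / len(fold_scores))  # the number compared across candidates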

You, however, are performing training and manual validation on the same set, which is not the correct approach.

pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
pipeline.fit(X_train,y_train)       # <- here is the problem
y_hat = pipeline.predict(X_train)
lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))

This only leads you to pick the hyperparameters that maximize performance on the training set, which is not what you want (you want the set of hyperparameters that performs well on the test set, i.e. that generalizes well).

That is why K (in KNN) comes out lower in your manual test: a lower K means less "regularization", so from the training set's point of view it looks optimal, even though it is not.
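
You can see the effect directly with a sketch like the following (again reusing the pipeline pieces from your question): with n_neighbors=1 every training point is its own nearest neighbor, so the training-set F1 is close to perfect, while the cross-validated F1 tells a different story.

# Training-set F1 tends to improve as K shrinks (memorization), which is why a
# training-set comparison favors small K; the cross-validated score does not.
for n in [1, 5, 10, 15]:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
    pipeline.fit(X_train, y_train)
    train_f1 = f1_score(y_train, pipeline.predict(X_train), pos_label="bad")
    cv_f1 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=f1_scorer).mean()
    print(n, "train F1:", round(train_f1, 3), "| cross-validated F1:", round(cv_f1, 3))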

If you want to validate the results manually, you either need to build a validation set yourself (and not use it during training), or you need to call a K-fold cross-validation procedure manually.
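
For example (a sketch mirroring the manual KNN loop from your question), the second option amounts to replacing the training-set F1 with a cross-validated F1 computed with the same f1_scorer the grid search used; the resulting means should match the grid search's mean_test_score, so the two procedures rank the candidates the same way.

# Corrected manual check: score each candidate with 5-fold cross-validation
# on the training data, instead of on the data the model was fitted on.
knn_cv_array = []
for n in [5, 10, 15]:
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
    knn_cv_array.append(cross_val_score(pipeline, X_train, y_train, cv=5, scoring=f1_scorer).mean())

print(knn_cv_array)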