# import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler   

# scaler and encoder options
scaler = StandardScaler()   # initial scaler; one of the 3 scaler options to try
encoder = OneHotEncoder()   # only one option, no need to GridSearch it

# Use ColumnTransformer to apply different preprocessing to the categorical and
# numerical columns. The transformer names ('categorical', 'numerical') are the
# names that nested parameter paths refer to.
# cat_columns / num_columns must be defined before this point.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', encoder, cat_columns),
        ('numerical', scaler, num_columns),
    ]
)

# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('log_reg', LogisticRegression()),
    ]
)


from sklearn.model_selection import GridSearchCV

# Scalers to try for the numerical columns. The key has to be the full nested
# parameter path: the Pipeline step name ('preprocessor') joined by a double
# underscore to the ColumnTransformer transformer name ('numerical'). A bare
# 'numerical' is not a parameter of the Pipeline and is rejected.
scaler_options = {
    'preprocessor__numerical': [StandardScaler(), RobustScaler(), MinMaxScaler()],
}

# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid=scaler_options, cv=5)

# fit the data
grid_cv.fit(X_train, y_train)



2 个答案:

答案 0 :(得分:1)

正如您所建议的,您可以创建一个 class,其 `__init__()` 接受一个参数,即您要使用的缩放器(见下方代码中的 `ScalerSelector`)。然后,您可以在网格搜索的参数中指定该 class 应使用哪个缩放器。


# import dependencies
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler   
from sklearn.datasets import load_breast_cancer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler   

import pandas as pd

class ScalerSelector(BaseEstimator, TransformerMixin):
    """Transformer that delegates scaling to a swappable scaler object.

    The scaler is a constructor parameter, so it can be replaced through
    ``set_params`` (for example by a grid-search key ending in ``__scaler``).

    Parameters
    ----------
    scaler : object with ``fit`` and ``transform`` methods, default=None
        Scaler to delegate to. ``None`` means a new ``StandardScaler()`` is
        created on every ``fit`` call, so default-constructed selectors never
        share fitted state.

    Attributes
    ----------
    scaler_ : the scaler fitted by the most recent ``fit`` call.
    """

    def __init__(self, scaler=None):
        # Store the parameter untouched; the default is resolved in fit().
        self.scaler = scaler

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return this transformer."""
        # A scaler passed by the caller is fitted in place (same object).
        self.scaler_ = StandardScaler() if self.scaler is None else self.scaler
        self.scaler_.fit(X)
        # Return self (not the inner scaler), following the estimator contract.
        return self

    def transform(self, X, y=None):
        """Return ``X`` transformed by the scaler fitted in ``fit``."""
        return self.scaler_.transform(X)

# Load the breast-cancer dataset.
data = load_breast_cancer()
features = data["data"]
target = data["target"]
# Rebind `data` to a DataFrame with named columns; the column names are what
# the ColumnTransformer below uses to select its inputs.
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()

# scaler and encoder options
my_scaler = ScalerSelector()

# Every column goes through the scaler wrapper, registered as 'numerical'.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', my_scaler, col_names),
    ]
)

# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('log_reg', LogisticRegression()),
    ]
)

# Scalers to compare. The key walks the nesting: Pipeline step 'preprocessor'
# -> ColumnTransformer entry 'numerical' -> the wrapper's 'scaler' parameter.
scaler_options = {
    'preprocessor__numerical__scaler': [
        StandardScaler(),
        RobustScaler(),
        MinMaxScaler(),
    ],
}

# Wrap the whole pipeline in a grid search over the scaler candidates.
grid_cv = GridSearchCV(estimator=full_pipeline, param_grid=scaler_options)

# Run the search on the feature DataFrame and the target vector.
grid_cv.fit(data, target)

# best params :


答案 1 :(得分:0)

您无需创建自定义转换器即可实现您的意图。您甚至可以在 param_grid 中传入 'passthrough',以试验在该步骤中完全不做任何缩放的情形。

在这个例子中,假设我们想研究:对数值特征(num_features)施加一个 Scaler 转换器是否能让模型表现更好。

# Split the feature columns by dtype; the 'target' column is dropped first.
# NOTE(review): `selector` is presumably sklearn.compose.make_column_selector
# imported under that alias, and `train` a DataFrame defined earlier - confirm.
cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))

cat_preprocessor = Pipeline(steps=[
    ('oh', OneHotEncoder(handle_unknown='ignore')),
    ('ss', StandardScaler()),  # placeholder; param_grid below swaps this step
])
num_preprocessor = Pipeline(steps=[
    ('pt', PowerTransformer(method='yeo-johnson')),
    ('ss', StandardScaler()),  # Create a place holder for your test here !!!
])
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_preprocessor, cat_features),
    ('num', num_preprocessor, num_features),
])
model = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', RidgeClassifier()),
])
X = train.drop('target', axis=1)
y = train['target']

# Nested key: Pipeline step 'prep' -> ColumnTransformer entry 'cat' -> inner
# Pipeline step 'ss'. 'passthrough' makes the step a no-op for that candidate.
param_grid = {
    'prep__cat__ss': ['passthrough', StandardScaler(with_mean=False)],
}
gs = GridSearchCV(model, param_grid=param_grid)
gs.fit(X, y)