如何在GridSearchCV中对数据进行标准化?
这是代码。我不知道该怎么做。
# Question code: load the spreadsheet and grid-search the LogisticRegression
# solver directly on the raw feature matrix.
# NOTE(review): the name `dataset` is rebound to a DataFrame a few lines
# below, so this import has no lasting effect -- it may originally have been
# a "# import dataset" comment; confirm.
import dataset
import warnings
# Suppress every warning emitted from here on.
warnings.filterwarnings("ignore")
import pandas as pd
dataset = pd.read_excel('../dataset/dataset_experiment1.xlsx')
# Features: all columns except the first and the last.
X = dataset.iloc[:,1:-1].values
# Target: the column at position 66 -- presumably the last column; TODO confirm.
y = dataset.iloc[:,66].values
from sklearn.model_selection import GridSearchCV
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# NOTE: the scaler is instantiated here but never applied to X in this snippet.
stdizer = StandardScaler()
print('===Grid Search===')
print('logistic regression')
# NOTE(review): LogisticRegression, kfold and scoring3 are not defined in this
# snippet -- presumably they come from a part of the script that is not shown.
model = LogisticRegression()
# Only the solver is searched; every other hyper-parameter keeps its default.
parameter_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=kfold, scoring = scoring3)
# The search runs on the unscaled X.
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('\n')
更新:这是我尝试运行的代码,但运行时报错:
# Asker's attempt: wrap the scaler and the classifier in a Pipeline and
# grid-search the Pipeline. This snippet is the one reported to raise an error.
print('logistic regression')
model = LogisticRegression()
# NOTE(review): the Pipeline import is commented out in the first snippet, so
# it must be imported somewhere for this line to run -- confirm.
pipeline = Pipeline([('scale', StandardScaler()), ('clf', model)])
# NOTE(review): likely cause of the reported error -- when the estimator is a
# Pipeline, grid keys must be prefixed with the step name and a double
# underscore (here 'clf__solver'); a bare 'solver' is not a Pipeline parameter.
parameter_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(pipeline, param_grid=parameter_grid, cv=kfold, scoring = scoring3)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('\n')
答案 0 :(得分:0)
演示:
# Demo: standardize inside the grid search by chaining the scaler and the
# classifier in a Pipeline and searching over the Pipeline itself.
# Every name the snippet uses is imported here so it runs on its own, given
# the X and y arrays loaded in the question.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hold out a third of the samples; the search only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Step names ('scale', 'clf') become the prefixes of the grid keys below.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression()),
])

# Pipeline parameters are addressed as '<step name>__<parameter name>'.
param_grid = [
    {
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__C': np.logspace(-3, 1, 5),  # 0.001, 0.01, 0.1, 1, 10
    },
]

# Because the scaler is a Pipeline step, it is fitted together with the
# classifier on each training split rather than once on the whole data set.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
答案 1 :(得分:0)
如果您使用 refit=True,那么可以直接使用 GridSearchCV 重新拟合得到的最佳模型进行预测。您也可以通过 cv_results_ 按排名分数(rank_test_score)找到最佳的那一行,再从该行中提取参数。如果您的特征列表变大,请改用 RandomizedSearchCV。
# Grid-search a scaler + LogisticRegression Pipeline, predict with the
# refitted best model, and inspect the search results as a DataFrame.
# Every name the snippet uses is imported here so it runs on its own, given
# the X and y arrays loaded in the question.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Step names ('scale', 'clf') become the prefixes of the grid keys below.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression()),
])

# Pipeline parameters are addressed as '<step name>__<parameter name>'.
param_grid = [
    {
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'clf__C': np.logspace(-3, 1, 5),
    },
]

# Search over the pipeline and grid defined above. The original passed
# `pipeline` / `parameter_grid`, which are the question's objects and carry
# the un-prefixed 'solver' key that a Pipeline rejects.
grid_class = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=4,  # use 4 cores
    cv=10,  # 10 folds
    refit=True,  # refit the best parameter setting on all of X_train
    return_train_score=True,
)
grid_class.fit(X_train, y_train)

# With refit=True, predict() delegates to the refitted best estimator.
predictions = grid_class.predict(X_test)

# One row per parameter combination; rank_test_score == 1 marks the best
# one (several rows can share rank 1 when scores tie).
cv_results_df = pd.DataFrame(grid_class.cv_results_)
best_row = cv_results_df[cv_results_df["rank_test_score"] == 1]
print(best_row)

# The 'params' column holds the parameter dict of every candidate.
params_column = cv_results_df.loc[:, ['params']]
print(params_column)