当我从girdsearch的管道中删除分类器之一时,它变得太慢了。我不明白背后的原因。
numeric_transformer = Pipeline(steps=[
('simp_imp1', SimpleImputer()),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('simp_imp2', SimpleImputer(strategy='constant')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
ordinal_tranformer = Pipeline(steps=[
('simp_imp3', SimpleImputer(strategy='most_frequent')),
('ordinal', OrdinalEncoder())])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
('ord', ordinal_tranformer, ordinal_features)],
remainder='passthrough')
clf1 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
clf2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
clf3 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', ElasticNet())])
clf4 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', SVC())])
clf5 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LinearSVC())])
clf6 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', KNeighborsClassifier())])
clf7 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', DecisionTreeClassifier())])
clf8 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', AdaBoostClassifier())])
clf9 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier())])
clf10 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', HistGradientBoostingClassifier())])
clf11 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', MLPClassifier())])
vc=VotingClassifier(estimators=[('clf1',clf1) , ('clf2',clf2), ('clf3',clf3), ('clf4',clf4), ('clf5',clf5),
('clf6',clf6), ('clf7',clf7), ('clf8',clf8), ('clf9',clf9), ('clf10',clf10),
('clf11',clf11)])
params_grid = [{
'clf1__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf1__classifier__C': np.logspace(-3,3,7),
'clf1__classifier__penalty':["l1","l2"],
'clf1__classifier' : [LogisticRegression()]
},
{
'clf2__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf2__classifier__n_estimators': [int(x) for x in np.linspace(start = 100, stop = 200, num = 50)],
'clf2__classifier__max_features': ['auto', 'sqrt', 'log2'],
'clf2__classifier__max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
'clf2__classifier__min_samples_split': [2, 5, 10],
'clf2__classifier__min_samples_leaf': [1, 2, 4],
'clf2__classifier__bootstrap': [True, False],
'clf2__classifier' : [RandomForestClassifier()],
},
{
'clf3__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf3__classifier__max_iter': [1, 5, 10],
'clf3__classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
"clf3__classifier__l1_ratio": np.arange(0.0, 1.0, 0.1),
'clf3__classifier': [ElasticNet()]
},
{
'clf4__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf4__classifier__C':[1,10,100,1000],
'clf4__classifier__gamma':[1,0.1,0.01, 0.001,0.0001],
'clf4__classifier__kernel':['linear','rbf','poly'],
'clf4__classifier': [SVC()]
},
{
'clf5__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf5__classifier__C':[1,10,100,1000],
'clf5__classifier__penalty': ['l1', 'l2'],
'clf5__classifier__loss': ['hinge', 'squared_hinge'],
'clf5__classifier': [LinearSVC()]
},
{
'clf6__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf6__classifier__n_neighbors':np.arange(1,30),
'clf6__classifier__weights':['uniform', 'distance'],
'clf6__classifier__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
'clf6__classifier': [KNeighborsClassifier()]
},
{
'clf7__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf7__classifier__criterion': ['entropy','gini'],
'clf7__classifier__max_depth': np.arange(2, 16, 2),
'clf7__classifier__min_samples_split': [5,10,15,20,25,30],
'clf7__classifier__min_samples_leaf': [2,3,4,5,6,7,8,9],
'clf7__classifier': [DecisionTreeClassifier()]
},
{
'clf8__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf8__classifier__n_estimators': [50, 100, 150, 200],
'clf8__classifier__learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
'clf8__classifier__algorithm': ['SAMME', 'SAMME.R'],
'clf8__classifier': [AdaBoostClassifier()]
},
{
'clf9__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf9__classifier__learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
'clf9__classifier__min_samples_split': np.linspace(0.1, 0.5, 12),
'clf9__classifier__min_samples_leaf': np.linspace(0.1, 0.5, 12),
'clf9__classifier__max_depth':[3,5,8],
'clf9__classifier__max_features':["log2","sqrt"],
'clf9__classifier__criterion': ["friedman_mse", "mae"],
'clf9__classifier__subsample':[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
'clf9__classifier__n_estimators':[100,200],
'clf9__classifier': [GradientBoostingClassifier()]
},
{
'clf10__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf10__classifier__loss': ['auto', 'binary_crossentropy', 'categorical_crossentropy'],
'clf10__classifier__learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
'clf10__classifier__max_iter': [100,400,700,1000],
'clf10__classifier__min_samples_leaf': [20, 40, 60, 80, 100],
'clf10__classifier__l2_regularization': [0, 0.2, 0.4, 0.6, 0.8, 1],
'clf10__classifier': [HistGradientBoostingClassifier()]
},
{
'clf11__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf11__classifier__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
'clf11__classifier__solver': ['sgd', 'adam'],
'clf11__classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
'clf11__classifier__learning_rate': ['constant','adaptive'],
'clf11__classifier': [MLPClassifier()]
}
]
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ["Target"]), df["Target"], test_size=0.3)
grid_search = GridSearchCV(vc, params_grid, cv=5, scoring = 'f1', verbose=50, n_jobs=-1)
gs = grid_search.fit(X_train, y_train)
print(("best model result from grid search: %.3f"
% gs.score(X_test, y_test)))
您可以看到,我有ElasticNet,当我尝试在VotingClassifier中使用它时,它最终是一个回归器抛出错误。我意识到了这一点后便放弃了,但是从那以后,训练开始变得缓慢。我的意思是真的很慢,一项任务需要1分钟,有时甚至停留在任务15上。
numeric_transformer = Pipeline(steps=[
('simp_imp1', SimpleImputer()),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('simp_imp2', SimpleImputer(strategy='constant')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
ordinal_tranformer = Pipeline(steps=[
('simp_imp3', SimpleImputer(strategy='most_frequent')),
('ordinal', OrdinalEncoder())])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
('ord', ordinal_tranformer, ordinal_features)],
remainder='passthrough')
clf1 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
clf2 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
clf3 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', SVC())])
clf4 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LinearSVC())])
clf5 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', KNeighborsClassifier())])
clf6 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', DecisionTreeClassifier())])
clf7 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', AdaBoostClassifier())])
clf8 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier())])
clf9 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', HistGradientBoostingClassifier())])
clf10 = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', MLPClassifier())])
vc=VotingClassifier(estimators=[('clf1',clf1) , ('clf2',clf2), ('clf3',clf3), ('clf4',clf4), ('clf5',clf5),
('clf6',clf6), ('clf7',clf7), ('clf8',clf8), ('clf9',clf9), ('clf10',clf10)])
params_grid = [{
'clf1__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf1__classifier__C': np.logspace(-3,3,7),
'clf1__classifier__penalty':["l1","l2"],
'clf1__classifier' : [LogisticRegression()]
},
{
'clf2__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf2__classifier__n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1000, num = 100)],
'clf2__classifier__min_samples_split': [2, 5, 10],
'clf2__classifier__min_samples_leaf': [5,10,15],
'clf2__classifier' : [RandomForestClassifier()]
},
{
'clf3__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf3__classifier__C':[0.01,0.1, 1,10,100],
'clf3__classifier__gamma':[10,1,0.1,0.01, 0.001],
'clf3__classifier__kernel':['linear','rbf','poly'],
'clf3__classifier': [SVC()]
},
{
'clf4__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf4__classifier__C' : [0.01,0.1, 1,10,100],
'clf4__classifier__penalty': ['l1', 'l2'],
'clf4__classifier': [LinearSVC()]
},
{
'clf5__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf5__classifier__n_neighbors':np.arange(3,30,2),
'clf5__classifier__weights':['uniform', 'distance'],
'clf5__classifier__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
'clf5__classifier': [KNeighborsClassifier()]
},
{
'clf6__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf6__classifier__max_depth': np.arange(2, 8, 2),
'clf6__classifier__min_samples_split': [5,10,15,20,25,30],
'clf6__classifier__min_samples_leaf': [2,3,4,5,6,7,8,9],
'clf6__classifier': [DecisionTreeClassifier()]
},
{
'clf7__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf7__classifier__n_estimators': [50, 100, 150, 200],
'clf7__classifier__learning_rate': [0.001, 0.01, 0.1, 1],
'clf7__classifier__algorithm': ['SAMME', 'SAMME.R'],
'clf7__classifier': [AdaBoostClassifier()]
},
{
'clf8__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf8__classifier__learning_rate': [0.001, 0.01, 0.1, 1],
'clf8__classifier__min_samples_split': np.linspace(0.1, 0.5, 5),
'clf8__classifier__min_samples_leaf': np.linspace(0.1, 0.5, 5),
'clf8__classifier__max_depth':[3,4,5,6,7],
'clf8__classifier__n_estimators':[100,200],
'clf8__classifier': [GradientBoostingClassifier()]
},
{
'clf9__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'], #done
'clf9__classifier__learning_rate': [0.001, 0.01, 0.1, 1],
'clf9__classifier__max_iter': [100,300],
'clf9__classifier__min_samples_leaf': [20, 40, 60, 80, 100],
'clf9__classifier__l2_regularization': [0, 0.2, 0.4, 0.6, 0.8, 1],
'clf9__classifier': [HistGradientBoostingClassifier()]
},
{
'clf10__preprocessor__num__simp_imp1__strategy': ['mean', 'median', 'most_frequent'],
'clf10__classifier__hidden_layer_sizes': [(50,25,10), (50,25), (100,)],
'clf10__classifier__solver': ['adam'],
'clf10__classifier__alpha': [0.001, 0.01, 0.1, 1, 10],
'clf10__classifier__learning_rate': ['constant','adaptive'],
'clf10__classifier': [MLPClassifier()]
}
]
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ["Target"]), df["Target"], test_size=0.3)
grid_search = GridSearchCV(vc, params_grid, cv=5, scoring = 'f1', verbose=1, n_jobs=-1)
gs = grid_search.fit(X_train, y_train)
print(("best model result from grid search: %.3f"
% gs.score(X_test, y_test)))
这是我的第二个示例,它确实很慢。第一个花了大约30分钟,而这个花了15个任务。我不明白原因。
P.S。 :我在Google Colab上对其进行了培训