我正在尝试将 GridSearchCV 用于多分类问题。由于我的数据不平衡,因此使用 ROC AUC 作为评分指标(scorer),并使用随机森林作为分类器。使用准确率、精确率和召回率等其他指标时没有问题,但使用 roc_auc 时会报错:"ValueError: multiclass format is not supported"。
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Synthetic, imbalanced multiclass dataset used to reproduce the problem.
X, y = make_classification(
    n_samples=500,
    # eight features in total
    n_features=8,
    # four features that actually predict the output's classes
    n_informative=4,
    # three redundant features (derived from the informative ones)
    n_redundant=3,
    # four output classes
    n_classes=4,
    # with 5% of observations in the first class, 10% in the second,
    # 20% in the third and 65% in the fourth ('None' makes balanced classes)
    weights=[.05, .10, .2, .65],
)
# Candidate model: a single-step pipeline whose only step, 'clf', is the
# random forest classifier.
model1 = Pipeline(steps=[('clf', RandomForestClassifier())])
# Hyperparameter grid; the 'clf__' prefix addresses the 'clf' pipeline step.
param_grid = {'clf__n_estimators': [100, 200, 300, 1000]}
# Grid search over the hyperparameters, run once per scoring metric.
#
# The plain 'roc_auc' scorer does not support multiclass targets and fails
# with "ValueError: multiclass format is not supported". The one-vs-rest
# multiclass scorer is used instead (requires scikit-learn >= 0.22);
# 'roc_auc_ovo', 'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' are the
# other multiclass-capable variants.
# y must stay a 1-D vector of class labels so StratifiedKFold can stratify.
scores = ['roc_auc_ovr', 'f1_macro']
# The splitter does not depend on the metric, so it is built once.
kf = StratifiedKFold(n_splits=5)
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    gscv = GridSearchCV(model1, param_grid, cv=kf, scoring=score)
    gscv.fit(X, y)
我已经尝试使用 LabelBinarizer() 对目标进行二值化处理,但随后出现了另一个错误:"ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead."
# Binarize the output
# NOTE: this is the attempted workaround that triggers the second error.
# After fit_transform, y is no longer the original 1-D label vector; per the
# traceback below, the stratified splitter's _make_test_folds sees the new
# target type as 'multilabel-indicator' and raises ValueError because it
# only accepts 'binary' or 'multiclass' targets.
lb = LabelBinarizer()
y = lb.fit_transform(y)
有什么可行的方法,可以让 ROC AUC 作为评分指标用于多分类的 GridSearchCV?
下面是完整的错误堆栈跟踪:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-93d61359c5d3> in <module>
46 gscv = GridSearchCV(model1, param_grid, cv=kf,
47 scoring=score)
---> 48 gscv.fit(X, y)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py in split(self, X, y, groups)
329 .format(self.n_splits, n_samples))
330
--> 331 for train, test in super(_BaseKFold, self).split(X, y, groups):
332 yield train, test
333
~\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py in split(self, X, y, groups)
98 X, y, groups = indexable(X, y, groups)
99 indices = np.arange(_num_samples(X))
--> 100 for test_index in self._iter_test_masks(X, y, groups):
101 train_index = indices[np.logical_not(test_index)]
102 test_index = indices[test_index]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py in _iter_test_masks(self, X, y, groups)
679
680 def _iter_test_masks(self, X, y=None, groups=None):
--> 681 test_folds = self._make_test_folds(X, y)
682 for i in range(self.n_splits):
683 yield test_folds == i
~\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py in _make_test_folds(self, X, y)
634 raise ValueError(
635 'Supported target types are: {}. Got {!r} instead.'.format(
--> 636 allowed_target_types, type_of_target_y))
637
638 y = column_or_1d(y)
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.