在Python中结合使用GridSearchCV和RFECV

时间:2018-09-09 21:40:28

标签: python pipeline grid-search rfe

我正在训练一个模型,为此需要先用RFECV做特征选择,然后再用GridSearchCV优化模型的参数。

代码

# Reproduction script: scale -> RFECV feature selection -> logistic regression,
# with the regularisation strength C tuned by GridSearchCV.

# Base estimator; the same instance is handed to RFECV and used as the final pipeline step.
model = LogisticRegression() #algorithm

# Wrap `score` (defined elsewhere) as a scorer for which larger values are better.
my_scorer = make_scorer(score, greater_is_better=True) #The score

# .split(...) yields (train, test) index pairs grouped by order_train; the result
# is a one-shot generator, not a reusable splitter object.
generador_train = GroupKFold(n_splits=10).split(X_train, y_train, order_train) #Generator 10 splits with order
# Candidate values 1e-3 ... 1e3 (seven powers of ten).
# NOTE(review): when the estimator is a Pipeline the grid key is normally
# prefixed with the step name ('model__C') -- confirm.
C= {'C': 10. ** np.arange(-3, 4)} #C
scaler = preprocessing.StandardScaler() #Standardized
# NOTE(review): the generator is stored as RFECV's `cv` parameter; the traceback
# below fails inside clone() while deep-copying a parameter of a pipeline step
# ("can't pickle generator objects"), which points at this argument -- verify.
selector =RFECV(cv=generador_train, estimator=model,scoring=my_scorer) #Selection of attributes

# Steps run in order: standardise, select features, fit the classifier.
pipe=Pipeline([('scaler', scaler),('select', selector),('model', model)]) # The pipeline is created


# The same generator object is reused here as the outer cross-validation `cv`.
grid = GridSearchCV(estimator=pipe, param_grid=C,cv=generador_train,scoring=my_scorer,refit=True) #The gridSearch with CV is declared

# fit() is the call that raises the TypeError shown in the traceback below.
grid.fit(X_train, y_train) # The pipeline is executed        
# With refit=True the best parameter setting is refitted and exposed here.
best_pipe=grid.best_estimator_

执行前面的代码时,我得到错误:

TypeError                                 Traceback (most recent call
last) <ipython-input-34-9d038a773283> in <module>()
     17 
     18     grid = GridSearchCV(estimator=pipe, param_grid=C,cv=generador_train,scoring=my_scorer,refit=True) #Se
declara el gridSearch con CV
---> 19     grid.fit(X_train,y_train)
     20     best_pipe=grid.best_estimator_

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py
in fit(self, X, y, groups, **fit_params)
    622                                      n_candidates * n_splits))
    623 
--> 624         base_estimator = clone(self.estimator)
    625         pre_dispatch = self.pre_dispatch
    626 

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in clone(estimator, safe)
     59     new_object_params = estimator.get_params(deep=False)
     60     for name, param in six.iteritems(new_object_params):
---> 61         new_object_params[name] = clone(param, safe=False)
     62     new_object = klass(**new_object_params)
     63     params_set = new_object.get_params(deep=False)

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in clone(estimator, safe)
     47     # XXX: not handling dictionaries
     48     if estimator_type in (list, tuple, set, frozenset):
---> 49         return estimator_type([clone(e, safe=safe) for e in estimator])
     50     elif not hasattr(estimator, 'get_params'):
     51         if not safe:

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in <listcomp>(.0)
     47     # XXX: not handling dictionaries
     48     if estimator_type in (list, tuple, set, frozenset):
---> 49         return estimator_type([clone(e, safe=safe) for e in estimator])
     50     elif not hasattr(estimator, 'get_params'):
     51         if not safe:

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in clone(estimator, safe)
     47     # XXX: not handling dictionaries
     48     if estimator_type in (list, tuple, set, frozenset):
---> 49         return estimator_type([clone(e, safe=safe) for e in estimator])
     50     elif not hasattr(estimator, 'get_params'):
     51         if not safe:

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in <listcomp>(.0)
     47     # XXX: not handling dictionaries
     48     if estimator_type in (list, tuple, set, frozenset):
---> 49         return estimator_type([clone(e, safe=safe) for e in estimator])
     50     elif not hasattr(estimator, 'get_params'):
     51         if not safe:

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in clone(estimator, safe)
     59     new_object_params = estimator.get_params(deep=False)
     60     for name, param in six.iteritems(new_object_params):
---> 61         new_object_params[name] = clone(param, safe=False)
     62     new_object = klass(**new_object_params)
     63     params_set = new_object.get_params(deep=False)

AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py
in clone(estimator, safe)
     50     elif not hasattr(estimator, 'get_params'):
     51         if not safe:
---> 52             return copy.deepcopy(estimator)
     53         else:
     54             raise TypeError("Cannot clone object '%s' (type %s): "

AppData\Local\Continuum\Anaconda3\lib\copy.py in
deepcopy(x, memo, _nil)
    167                     reductor = getattr(x, "__reduce_ex__", None)
    168                     if reductor:
--> 169                         rv = reductor(4)
    170                     else:
    171                         reductor = getattr(x, "__reduce__", None)

TypeError: can't pickle generator objects

该如何解决?可能是什么原因导致的?

更新1

我已经改成:

# Materialise the (train, test) index pairs into a list: unlike the one-shot
# generator returned by .split(), a list can be copied by sklearn's clone().
# The assignment must be outside list(); `list(name=...)` would pass a keyword
# argument to list() and raise TypeError without binding generador_train.
generador_train = list(GroupKFold(n_splits=10).split(X_train, y_train, order_train))

但是我得到了这个错误:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-150-d0ca294b7811> in <module>()
 25 
 26 grid = GridSearchCV(estimator=pipe, param_grid=C, cv=generador_train,scoring=my_scorer,refit=True) #Se declara el gridSearch con  CV
---> 27 grid.fit(X_train, y_train) # Se ejecuta la pipeline
 28 #grid.fit(digits.data, digits.target)
 29 #res=pipe.named_steps['select'].grid_scores_ #Resultados gridSearch

~\Anaconda4\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
637                                   error_score=self.error_score)
638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
640 
641         # if one choose to see train score, "out" will contain train score info

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in     __call__(self, iterable)
777             # was dispatched. In particular this covers the edge
778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
780                 self._iterating = True
781             else:

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in  dispatch_one_batch(self, iterator)
623                 return False
624             else:
--> 625                 self._dispatch(tasks)
626                 return True
627 

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586         dispatch_timestamp = time.time()
587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
589         self._jobs.append(job)
590 

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109     def apply_async(self, func, callback=None):
110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
112         if callback:
113             callback(result)

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330         # Don't delay the application, to avoid keeping the input
331         # arguments in memory
--> 332         self.results = batch()
333 
334     def get(self):

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129 
130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
132 
133     def __len__(self):

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129 
130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
132 
133     def __len__(self):

~\Anaconda4\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
456             estimator.fit(X_train, **fit_params)
457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
459 
460     except Exception as e:

~\Anaconda4\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
246             This estimator
247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
249         if self._final_estimator is not None:
250             self._final_estimator.fit(Xt, y, **fit_params)

~\Anaconda4\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
211                 Xt, fitted_transformer = fit_transform_one_cached(
212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
214                 # Replace the transformer of the step with the fitted
215                 # transformer. This is necessary when loading the transformer

~\Anaconda4\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
360 
361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
363 
364     def call_and_shelve(self, *args, **kwargs):

~\Anaconda4\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579                        **fit_params):
580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
582     else:
583         res = transformer.fit(X, y, **fit_params).transform(X)

~\Anaconda4\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
518         else:
519             # fit method of arity 2 (supervised transformation)
--> 520             return self.fit(X, y, **fit_params).transform(X)
521 
522 

~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in fit(self, X, y)
434         scores = parallel(
435             func(rfe, self.estimator, X, y, train, test, scorer)
--> 436             for train, test in cv.split(X, y))
437 
438         scores = np.sum(scores, axis=0)

~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in <genexpr>(.0)
434         scores = parallel(
435             func(rfe, self.estimator, X, y, train, test, scorer)
--> 436             for train, test in cv.split(X, y))
437 
438         scores = np.sum(scores, axis=0)

~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in _rfe_single_fit(rfe, estimator, X, y, train, test, scorer)
 26     Return the score for a fit across one fold.
 27     """
 ---> 28     X_train, y_train = _safe_split(estimator, X, y, train)
 29     X_test, y_test = _safe_split(estimator, X, y, test, train)
 30     return rfe._fit(

 ~\Anaconda4\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198             X_subset = X[np.ix_(indices, train_indices)]
199     else:
--> 200         X_subset = safe_indexing(X, indices)
201 
202     if y is not None:

~\Anaconda4\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
158                                    indices.dtype.kind == 'i'):
159             # This is often substantially faster than X[indices]
--> 160             return X.take(indices, axis=0)
161         else:
162             return X[indices]

IndexError: index 182 is out of bounds for size 182

我哪里做错了?

更新2

数据中每2条记录(元组)具有相同的ID,拆分数据时不能把它们分到不同的子集中。

Order_train的创建:

# Group identifiers: the first column of mydata holds the Ids used as the order.
order = mydata.iloc[:, 0].values

# Take a single grouped shuffle split that holds out 25% for testing, so rows
# sharing an Id end up on the same side of the split.
splitter = GroupShuffleSplit(test_size=0.25)
train_indices, test_indices = next(splitter.split(X, y, order))

# Slice features and targets with the resulting index arrays.
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Group identifiers restricted to the training rows.
order_train = mydata.iloc[train_indices, 0].values

Order_train:

[  1.   1.   2.   2.   3.   3.   4.   4.   5.   5.   6.   6.
7.   7.   8.   8.   9.   9.   10.   10.   11.   11.   12.   12.
13.   13.   14.   14.   15.   15.   16.   16.   17.   17.   18.   18.
19.   19.   20.   20.   21.   21.   22.   22.   23.   23.   24.   24.
25.   25.   26.   26.   27.   27.   28.   28.   29.   29.   30.   30.
31.   31.   32.   32.   33.   33.   34.   34.   35.   35.   36.   36.
37.   37.   38.   38.   39.   39.   40.   40.   41.   41.   42.   42.
43.   43.   44.   44.   45.   45.   46.   46.   47.   47.   48.   48.
49.   49.   50.   50.   51.   51.   52.   52.  53.  53.  54.  54.
55.  55.  56.  56.  57.  57.  58.  58.  59.  59.  60.  60.
61.  61.  62.  62.  63.  63.  64.  64.  65.  65.  66.  66.
67.  67.  68.  68.  69.  69.  70.  70.  71.  71.  72.  72.
73.  73.  74.  74.  75.  75.  76.  76.  77.  77.  78.  78.
79.  79.  80.  80.  81.  81.  82.  82.  83.  83.  84.  84.
85.  85.  86.  86.  87.  87.  88.  88.  89.  89.  90.  90.
91.  91.  92.  92.  93.  93.  94.  94.  95.  95.  96.  96.
97.  97.  98.  98.  99.  99.  100.  100.  101.  101.  102.  102.]

1 个答案:

答案 0 :(得分:0)

当您说"I need that GroupKFold makes the splits using an order."时,我假设您指的是把groups=order_train传给GroupKFold。您可以把它传给GridSearchCV.fit(),它会被自动转交给GroupKFold,从而按您想要的分组方式进行拆分。

目前您的做法是:

# Current approach: the splits are produced up front, so `cv` receives the
# generator returned by .split() rather than a splitter object.
generador_train = GroupKFold(n_splits=10).split(X_train, y_train, order_train) 

grid = GridSearchCV(estimator=pipe, 
                   param_grid=C,
                   cv=generador_train,
                   scoring=my_scorer,
                   refit=True) 

# No groups are passed to fit(); the grouping is already baked into generador_train.
grid.fit(X_train, y_train)

我的意思是你可以这样做:

# Pass the splitter object itself as `cv`; GridSearchCV calls its split()
# internally and forwards the groups given to fit().
# NOTE(review): `pipe` still contains the RFECV step built with
# cv=generador_train; the first traceback in the question fails while cloning
# a parameter of a pipeline step, so that generator probably has to be
# replaced as well -- verify.
grid = GridSearchCV(estimator=pipe,
                    param_grid=C,
                    cv=GroupKFold(n_splits=10),
                    scoring=my_scorer,
                    refit=True)

# Pass the groups by keyword: a third positional argument is only accepted by
# the older fit(X, y, groups, **fit_params) signature, whereas `groups=` works
# on both old and current scikit-learn releases.
grid.fit(X_train, y_train, groups=order_train)