我正在训练一个模型。为此,我需要先用 RFECV 做特征选择,然后用 GridSearchCV 优化模型的参数。
代码
# Estimator shared by the feature selector and the final pipeline step.
model = LogisticRegression()

# Turn the user-defined metric `score` into a scorer; higher is better.
my_scorer = make_scorer(score, greater_is_better=True)

# 10 group-aware (train, test) splits of the training data, keyed by
# `order_train`.
# NOTE(review): .split() returns a one-shot generator, and the same object is
# handed to both RFECV and GridSearchCV below -- confirm this is intended.
generador_train = GroupKFold(n_splits=10).split(X_train, y_train, order_train)

# Candidate regularisation strengths: 10**-3 ... 10**3.
# NOTE(review): the key is plain 'C' although the grid is applied to a
# Pipeline; presumably it needs a step prefix such as 'model__C' -- verify.
C = {'C': 10. ** np.arange(-3, 4)}

# Standardise the features before selection and fitting.
scaler = preprocessing.StandardScaler()

# Recursive feature elimination, cross-validated on the splits above.
selector = RFECV(
    estimator=model,
    cv=generador_train,
    scoring=my_scorer,
)

# scale -> select features -> classify
pipe = Pipeline([
    ('scaler', scaler),
    ('select', selector),
    ('model', model),
])

# Grid search over C on the whole pipeline, refitting the best candidate.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=C,
    cv=generador_train,
    scoring=my_scorer,
    refit=True,
)
grid.fit(X_train, y_train)
best_pipe = grid.best_estimator_
执行前面的代码时,我得到错误:
TypeError Traceback (most recent call last) <ipython-input-34-9d038a773283> in <module>() 17 18 grid = GridSearchCV(estimator=pipe, param_grid=C,cv=generador_train,scoring=my_scorer,refit=True) #Se declara el gridSearch con CV ---> 19 grid.fit(X_train,y_train) 20 best_pipe=grid.best_estimator_ AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params) 622 n_candidates * n_splits)) 623 --> 624 base_estimator = clone(self.estimator) 625 pre_dispatch = self.pre_dispatch 626 AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe) 59 new_object_params = estimator.get_params(deep=False) 60 for name, param in six.iteritems(new_object_params): ---> 61 new_object_params[name] = clone(param, safe=False) 62 new_object = klass(**new_object_params) 63 params_set = new_object.get_params(deep=False) AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe) 47 # XXX: not handling dictionaries 48 if estimator_type in (list, tuple, set, frozenset): ---> 49 return estimator_type([clone(e, safe=safe) for e in estimator]) 50 elif not hasattr(estimator, 'get_params'): 51 if not safe: AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0) 47 # XXX: not handling dictionaries 48 if estimator_type in (list, tuple, set, frozenset): ---> 49 return estimator_type([clone(e, safe=safe) for e in estimator]) 50 elif not hasattr(estimator, 'get_params'): 51 if not safe: AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe) 47 # XXX: not handling dictionaries 48 if estimator_type in (list, tuple, set, frozenset): ---> 49 return estimator_type([clone(e, safe=safe) for e in estimator]) 50 elif not hasattr(estimator, 'get_params'): 51 if not safe: AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0) 47 # XXX: not handling dictionaries 48 if estimator_type in 
(list, tuple, set, frozenset): ---> 49 return estimator_type([clone(e, safe=safe) for e in estimator]) 50 elif not hasattr(estimator, 'get_params'): 51 if not safe: AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe) 59 new_object_params = estimator.get_params(deep=False) 60 for name, param in six.iteritems(new_object_params): ---> 61 new_object_params[name] = clone(param, safe=False) 62 new_object = klass(**new_object_params) 63 params_set = new_object.get_params(deep=False) AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe) 50 elif not hasattr(estimator, 'get_params'): 51 if not safe: ---> 52 return copy.deepcopy(estimator) 53 else: 54 raise TypeError("Cannot clone object '%s' (type %s): " AppData\Local\Continuum\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil) 167 reductor = getattr(x, "__reduce_ex__", None) 168 if reductor: --> 169 rv = reductor(4) 170 else: 171 reductor = getattr(x, "__reduce__", None) TypeError: can't pickle generator objects
如何解决?可能是哪个原因?
更新1
我已经改成:
# Materialise the splits into a list of (train, test) index pairs. The
# assignment has to sit outside list(): list() accepts no keyword arguments,
# so `list(generador_train=...)` raises TypeError and binds nothing.
generador_train = list(
    GroupKFold(n_splits=10).split(X_train, y_train, order_train)
)
但是我得到了这个错误:
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-150-d0ca294b7811> in <module>() 25 26 grid = GridSearchCV(estimator=pipe, param_grid=C, cv=generador_train,scoring=my_scorer,refit=True) #Se declara el gridSearch con CV ---> 27 grid.fit(X_train, y_train) # Se ejecuta la pipeline 28 #grid.fit(digits.data, digits.target) 29 #res=pipe.named_steps['select'].grid_scores_ #Resultados gridSearch ~\Anaconda4\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params) 637 error_score=self.error_score) 638 for parameters, (train, test) in product(candidate_params, --> 639 cv.split(X, y, groups))) 640 641 # if one choose to see train score, "out" will contain train score info ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable) 777 # was dispatched. In particular this covers the edge 778 # case of Parallel used with an exhausted iterator. 
--> 779 while self.dispatch_one_batch(iterator): 780 self._iterating = True 781 else: ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator) 623 return False 624 else: --> 625 self._dispatch(tasks) 626 return True 627 ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch) 586 dispatch_timestamp = time.time() 587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) --> 588 job = self._backend.apply_async(batch, callback=cb) 589 self._jobs.append(job) 590 ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback) 109 def apply_async(self, func, callback=None): 110 """Schedule a func to be run""" --> 111 result = ImmediateResult(func) 112 if callback: 113 callback(result) ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch) 330 # Don't delay the application, to avoid keeping the input 331 # arguments in memory --> 332 self.results = batch() 333 334 def get(self): ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): ~\Anaconda4\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score) 456 estimator.fit(X_train, **fit_params) 457 else: --> 458 estimator.fit(X_train, y_train, **fit_params) 459 460 except Exception as e: ~\Anaconda4\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params) 246 This estimator 247 
""" --> 248 Xt, fit_params = self._fit(X, y, **fit_params) 249 if self._final_estimator is not None: 250 self._final_estimator.fit(Xt, y, **fit_params) ~\Anaconda4\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params) 211 Xt, fitted_transformer = fit_transform_one_cached( 212 cloned_transformer, None, Xt, y, --> 213 **fit_params_steps[name]) 214 # Replace the transformer of the step with the fitted 215 # transformer. This is necessary when loading the transformer ~\Anaconda4\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs) 360 361 def __call__(self, *args, **kwargs): --> 362 return self.func(*args, **kwargs) 363 364 def call_and_shelve(self, *args, **kwargs): ~\Anaconda4\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params) 579 **fit_params): 580 if hasattr(transformer, 'fit_transform'): --> 581 res = transformer.fit_transform(X, y, **fit_params) 582 else: 583 res = transformer.fit(X, y, **fit_params).transform(X) ~\Anaconda4\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params) 518 else: 519 # fit method of arity 2 (supervised transformation) --> 520 return self.fit(X, y, **fit_params).transform(X) 521 522 ~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in fit(self, X, y) 434 scores = parallel( 435 func(rfe, self.estimator, X, y, train, test, scorer) --> 436 for train, test in cv.split(X, y)) 437 438 scores = np.sum(scores, axis=0) ~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in <genexpr>(.0) 434 scores = parallel( 435 func(rfe, self.estimator, X, y, train, test, scorer) --> 436 for train, test in cv.split(X, y)) 437 438 scores = np.sum(scores, axis=0) ~\Anaconda4\lib\site-packages\sklearn\feature_selection\rfe.py in _rfe_single_fit(rfe, estimator, X, y, train, test, scorer) 26 Return the score for a fit across one fold. 
27 """ ---> 28 X_train, y_train = _safe_split(estimator, X, y, train) 29 X_test, y_test = _safe_split(estimator, X, y, test, train) 30 return rfe._fit( ~\Anaconda4\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices) 198 X_subset = X[np.ix_(indices, train_indices)] 199 else: --> 200 X_subset = safe_indexing(X, indices) 201 202 if y is not None: ~\Anaconda4\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices) 158 indices.dtype.kind == 'i'): 159 # This is often substantially faster than X[indices] --> 160 return X.take(indices, axis=0) 161 else: 162 return X[indices] IndexError: index 182 is out of bounds for size 182
我哪里做错了?
更新2
每个 ID 对应 2 个样本(元组),拆分数据时不能把它们分到不同的子集中。
Order_train的创建:
# Group label of every sample: the ids stored in the first column of `mydata`.
order = mydata.iloc[:, 0].values

# Take the first group-aware shuffle split (test_size=0.25) as the
# train/test partition.
splitter = GroupShuffleSplit(test_size=0.25)
train_indices, test_indices = next(splitter.split(X, y, order))

# Slice features and targets with the chosen indices.
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

# Group ids of the training rows, in the same order as X_train.
order_train = mydata.iloc[train_indices, 0].values
Order_train:
[ 1. 1. 2. 2. 3. 3. 4. 4. 5. 5. 6. 6.
7. 7. 8. 8. 9. 9. 10. 10. 11. 11. 12. 12.
13. 13. 14. 14. 15. 15. 16. 16. 17. 17. 18. 18.
19. 19. 20. 20. 21. 21. 22. 22. 23. 23. 24. 24.
25. 25. 26. 26. 27. 27. 28. 28. 29. 29. 30. 30.
31. 31. 32. 32. 33. 33. 34. 34. 35. 35. 36. 36.
37. 37. 38. 38. 39. 39. 40. 40. 41. 41. 42. 42.
43. 43. 44. 44. 45. 45. 46. 46. 47. 47. 48. 48.
49. 49. 50. 50. 51. 51. 52. 52. 53. 53. 54. 54.
55. 55. 56. 56. 57. 57. 58. 58. 59. 59. 60. 60.
61. 61. 62. 62. 63. 63. 64. 64. 65. 65. 66. 66.
67. 67. 68. 68. 69. 69. 70. 70. 71. 71. 72. 72.
73. 73. 74. 74. 75. 75. 76. 76. 77. 77. 78. 78.
79. 79. 80. 80. 81. 81. 82. 82. 83. 83. 84. 84.
85. 85. 86. 86. 87. 87. 88. 88. 89. 89. 90. 90.
91. 91. 92. 92. 93. 93. 94. 94. 95. 95. 96. 96.
97. 97. 98. 98. 99. 99. 100. 100. 101. 101. 102. 102.]
答案 0 :(得分:0)
当您说 "I need that GroupKFold makes the splits using an order." 时,我假设您指的是把 groups=order_train 传给 GroupKFold。您可以把它传给 GridSearchCV.fit(),它会被自动转交给 GroupKFold,从而按您想要的分组生成拆分。
当前您正在尝试做:
# Current approach: the splits are generated up front ...
generador_train = GroupKFold(n_splits=10).split(X_train, y_train, order_train)

# ... and the resulting generator is handed to the search as `cv`,
# so fit() itself never receives the group labels.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=C,
    scoring=my_scorer,
    cv=generador_train,
    refit=True,
)
grid.fit(X_train, y_train)
我的意思是你可以这样做:
# Give GridSearchCV the splitter object itself rather than the generator
# returned by .split(); the search calls cv.split(X, y, groups) on its own.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=C,
    cv=GroupKFold(n_splits=10),
    scoring=my_scorer,
    refit=True,
)

# Pass the group labels by keyword: recent scikit-learn releases declare
# `groups` as keyword-only in fit(), so a third positional argument fails
# there, while the keyword form works on old and new versions alike.
# NOTE(review): `pipe` still contains the RFECV step that was built with
# cv=generador_train; the clone() traceback in the question runs through the
# pipeline steps, so that step presumably needs a splitter object as well --
# confirm before relying on this snippet alone.
grid.fit(X_train, y_train, groups=order_train)