我有一个复杂的错误,无法解释,也无法弄清楚。显然,当简化模型拟合时,我的预处理管道有效,但是当我尝试进行交叉验证时,预处理管道失败。我无法破译错误,也无法理解问题。请帮忙。
我创建了一个管道,该管道对数据执行一些预处理任务。有用。包括一些客户变压器。这是代码。
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.cols = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.cols]
class dummy_creator(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
X_categorical_scaled_df = pd.get_dummies(X)
return X_categorical_scaled_df
class DFStandardScaler(BaseEstimator,TransformerMixin):
def __init__(self):
self.ss = None
def fit(self,X,y=None):
self.ss = StandardScaler().fit(X)
return self
def transform(self, X):
Xss = self.ss.transform(X)
X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return X_continuous_scaled_df
pipeline_categorical = Pipeline(steps = [
('column_selector', column_selector(categorical_features)),
('create_dummies', dummy_creator())
])
pipeline_continuous = Pipeline(steps = [
('column_selector', column_selector(numeric_features)),
('scaler',DFStandardScaler())
])
feature_union = FeatureUnion([('cat', pipeline_categorical),
('cont', pipeline_continuous)])
如果我fit_transform
管道中,我会得到很好的结果:
X_train_enc = feature_union.fit_transform(X_train)
X_train_enc
>>>array([[ 0. , 1. , 0. , ..., -0.05977797,
-0.21011127, -0.24460191],
[ 1. , 0. , 0. , ..., -0.68765273,
-0.00946558, -0.82457039],
[ 0. , 1. , 0. , ..., -1.06122696,
如果我现在使用上述预处理管道和模型(在本例中为线性回归)建立管道,那么我仍然会获得良好的结果(仅显示如下所示的预测值即可正确预处理数据并进行模型拟合):>
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(feature_union, LinearRegression())
pipe.fit(X_train, y_train)
pipe.predict(X_validation)
>>>array([ 9.17773438, 9.38226318, 8.35693359, 10.62176514, 11.29095459,
7.45025635, 6.03497314, 10.04321289, 10.57568359, 9.86663818,
7.01202393, 8.08374023, 8.80700684, 10.80102539, 12.32678223,
6.7588501 , 10.44604492, 6.86547852, 9.20465088, 9.04406738,
现在,我尝试使用交叉验证来测试相同的模型。您会注意到,我将管道放入了一个列表(“管道”)中,并在循环中进行了交叉验证。这是因为我打算使用不同的模型创建与此类似的管道列表并遍历它们,但这超出了我的讨论范围(但以防万一,您可能想知道为什么我用这种方式进行编码)
seed = 7
pipelines = []
pipelines.append(('ScaledLR',Pipeline([('Preprocess', feature_union),('LR', LinearRegression())])))
results=[]
names=[]
scoring='neg_mean_squared_error'
for name, model in pipelines:
kfold=KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
print("%s %f (%r)" % (name, cv_results.mean(), cv_results.std()))
然后出现以下错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2655 try:
-> 2656 return self._engine.get_loc(key)
2657 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-70-1aa4a50ac843> in <module>
29
30 kfold=KFold(n_splits=10, random_state=7)
---> 31 cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
32 results.append(cv_results)
33 names.append(name)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
791 delayed(_fit_transform_one)(trans, X, y, weight,
792 **fit_params)
--> 793 for name, trans, weight in self._iter())
794
795 if not result:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
296 """
297 last_step = self._final_estimator
--> 298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
300 return last_step.fit_transform(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
463 else:
464 # fit method of arity 2 (supervised transformation)
--> 465 return self.fit(X, y, **fit_params).transform(X)
466
467
<ipython-input-24-666c2228e73d> in transform(self, X, y)
13
14 def transform(self, X, y=None):
---> 15 return X.loc[:, self.cols]
16
17
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
986 for i, key in enumerate(tup):
987 if is_label_like(key) or isinstance(key, tuple):
--> 988 section = self._getitem_axis(key, axis=i)
989
990 # we have yielded a scalar ?
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3574
3575 if axis == 1:
-> 3576 return self[key]
3577
3578 self._consolidate_inplace()
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 return self._engine.get_loc(key)
2657 except KeyError:
-> 2658 return self._engine.get_loc(self._maybe_cast_indexer(key))
2659 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2660 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
该错误似乎位于预处理功能部件中-但我不确定确切的位置或原因。我相信它可能在pd.get_dummies()函数周围的create_dummies类中,但不确定。
有人可以告知发生了什么事吗?
答案 0 :(得分:1)
我知道这是一个老问题,但是当我遇到同样的问题(代码在没有交叉验证的情况下工作并因此失败)时,它是第一个出现的问题。
失败的原因是您的自定义估算器 column_selector
的签名与属性不匹配。来自 sklearn documentation 关于开发估计器:
此外,__init__ 接受的每个关键字参数都应该对应于实例上的一个属性。在进行模型选择时,Scikit-learn 依赖于此找到要在估计器上设置的相关属性。
如果您将其更改为:
应该可以正常工作class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.columns = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.columns]
答案 1 :(得分:0)
经过一天半的脑力衰竭,我创建了一个解决方法。
我从包含特征联合和模型的最终管道中删除了预处理特征联合。我在每个管道循环内的训练集上运行功能并集,然后在交叉编码中调用模型,同时输入编码/转换后的变量。这是代码。
from sklearn.pipeline import Pipeline
seed = 7
#prepare Models
pipelines = []
pipelines.append(('ScaledLR',Pipeline([('LR',LinearRegression())])))
#evaluate each model
results=[]
names=[]
scoring='neg_mean_squared_error'
for name, model in pipelines:
X_train_enc = feature_union.fit_transform(X_train)
y_train = Y_train.values
kfold=KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(model, X_train_enc, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
print("%s %f (%r)" % (name, cv_results.mean(),cv_results.std()))
当然,从技术上讲,我不需要简单调用模型的最终管道。但是,由于此工作仍在进行中,因此我暂时保留,我可能会再进行一些其他操作。