在 sklearn 管道(Pipeline)中使用 ravel()

时间:2017-10-02 09:31:40

标签: python scikit-learn sklearn-pandas

我正在尝试创建一个简单的管道,把分类数据转换为独热(one-hot)向量。不幸的是它失败了,因为出于某种原因,数据需要事先用 ravel() 展平成一维。

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of its input.

    The input is expected to be a pandas DataFrame (per the comment that
    introduces this class); the selected columns are handed on via the
    ``.values`` attribute so later steps do not receive the DataFrame itself.
    """

    def __init__(self, attribute_names):
        # Stored under the same name as the constructor argument.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing and return this instance; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``.values`` of the ``attribute_names`` columns of ``X``."""
        chosen_columns = X[self.attribute_names]
        return chosen_columns.values

# Pipeline meant to turn the categorical columns into one-hot vectors:
# select columns -> flatten -> label-encode -> one-hot encode.
# NOTE(review): `cat_attributes`, `ravel`, `LabelEncoder` and `OneHotEncoder`
# are not defined or imported in the visible snippet -- confirm they exist
# elsewhere before running this.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attributes)),
        ('rave', ravel()), # <- placeholder: a step is needed here that ravels the data before it reaches LabelEncoder()
        ('cat_encoder_label', LabelEncoder()),
        ('cat_encoder_hot', OneHotEncoder())
    ])

我输入的数据如下所示:

X.head()
1    Geuzenveld - Slotermeer
2    Westerpark
3    Noord-Oost
4    IJburg - Zeeburgereiland
5    De Aker - Nieuw Sloten

X.type()
numpy.ndarray

Xt = cat_pipeline.fit_transform(X)
- FAILS

我该如何创建此功能?

编辑,这是错误:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-173-4876cd8a7bee> in <module>()
     17 #data['neighbourhood_cleansed'].values.ravel().shape, data['neighbourhood_cleansed'].values.shape
     18 
---> 19 c = cat_pipeline.fit_transform(data)

~\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    288         """
    289         last_step = self._final_estimator
--> 290         Xt, fit_params = self._fit(X, y, **fit_params)
    291         if hasattr(last_step, 'fit_transform'):
    292             return last_step.fit_transform(Xt, y, **fit_params)

~\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    220                 Xt, fitted_transformer = fit_transform_one_cached(
    221                     cloned_transformer, None, Xt, y,
--> 222                     **fit_params_steps[name])
    223                 # Replace the transformer of the step with the fitted
    224                 # transformer. This is necessary when loading the transformer

~\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    363 
    364     def call_and_shelve(self, *args, **kwargs):

~\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    587                        **fit_params):
    588     if hasattr(transformer, 'fit_transform'):
--> 589         res = transformer.fit_transform(X, y, **fit_params)
    590     else:
    591         res = transformer.fit(X, y, **fit_params).transform(X)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

0 个答案:

没有答案