Custom FeatureUnion不起作用?

时间:2017-09-05 12:09:23

标签: python pandas scikit-learn

我尝试修改this示例以使用Pandas数据帧而不是测试数据集。我无法这样做,因为ItemSelector似乎无法识别列名。

请注意数据框df_resolved.columns的列返回:

Index(['u_category', ... ... 'resolution_time', 'rawtext'],
      dtype='object')

所以我显然在我的数据框中有这个。

但是,当我尝试运行解决方案时,我收到错误

  

" ValueError:没有名称为u_category"

的字段

此外,我似乎无法修改代码以支持在ItemSelector中选择多个列,因此在此解决方案中,我必须分别应用变换器列。

我的代码是:

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]


class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of strings and produces a dict of sequences.  Keys are
    `subject` and `body`.
    """
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('subject', object), ('body', object)])
        for i, text in enumerate(posts):
            headers, _, bod = text.partition('\n\n')
            bod = strip_newsgroup_footer(bod)
            bod = strip_newsgroup_quoting(bod)
            features['body'][i] = bod

            prefix = 'Subject:'
            sub = ''
            for line in headers.split('\n'):
                if line.startswith(prefix):
                    sub = line[len(prefix):]
                    break
            features['subject'][i] = sub

        return features


pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the post's subject line
            ('rawtext', Pipeline([
                ('selector', ItemSelector(key='u_category')),
                ('labelenc', preprocessing.LabelEncoder()),
            ])),
            # Pipeline for standard bag-of-words model for body
            ('features', Pipeline([
                ('selector', ItemSelector(key='rawtext')),
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=1, 
                                          stop_words='english', 
                                          token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
            ])),
        ],

        # weight components in FeatureUnion
        transformer_weights={
            'rawtext': 1.0,
            'features': 1.0,
        },
    )),

    # Use a SVC classifier on the combined features
    ('linear_svc', LinearSVC(penalty="l2")),
])

# limit the list of categories to make running this example faster.
X_train, X_test, y_train, y_test = train_test_split(df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')], df_resolved['assignment_group.name'], test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
print(pipeline.score(X_test, y_test))

如何修改此代码以便与我的数据帧一起正常工作,并且可能支持一次将变换器应用于多个列?

如果我取出ItemSelector,它似乎有效。所以这有效:

ds = ItemSelector(key='u_category')
ds.fit(df_resolved)

labelenc = preprocessing.LabelEncoder()
labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved))

完全堆积痕迹:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-93-a4ba29c137ec> in <module>()
    136 
    137 
--> 138 pipeline.fit(X_train, y_train)
    139 #y = pipeline.predict(X_test)
    140 #print(classification_report(y, test.target))

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
    575                        **fit_params):
    576     if hasattr(transformer, 'fit_transform'):
--> 577         res = transformer.fit_transform(X, y, **fit_params)
    578     else:
    579         res = transformer.fit(X, y, **fit_params).transform(X)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    299         """
    300         last_step = self._final_estimator
--> 301         Xt, fit_params = self._fit(X, y, **fit_params)
    302         if hasattr(last_step, 'fit_transform'):
    303             return last_step.fit_transform(Xt, y, **fit_params)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    495         else:
    496             # fit method of arity 2 (supervised transformation)
--> 497             return self.fit(X, y, **fit_params).transform(X)
    498 
    499 

<ipython-input-93-a4ba29c137ec> in transform(self, data_dict)
     55 
     56     def transform(self, data_dict):
---> 57         return data_dict[self.key]
     58 
     59 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx)
    497 
    498     def __getitem__(self, indx):
--> 499         obj = super(recarray, self).__getitem__(indx)
    500 
    501         # copy behavior of getattr, except that here

ValueError: no field of name u_category

更新

即使我使用数据框(否train_test_split),问题仍然存在:enter image description here

更新2: 好的,所以我删除了SubjectBodyExtractor,因为我不需要那个。现在ValueError: no field of name u_category已消失,但我遇到了一个新错误:TypeError: fit_transform() takes 2 positional arguments but 3 were given

堆栈追踪:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-110-292294015e44> in <module>()
    129 
    130 
--> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name'])
    132 #y = pipeline.predict(X_test)
    133 #print(classification_report(y, test.target))

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
    575                        **fit_params):
    576     if hasattr(transformer, 'fit_transform'):
--> 577         res = transformer.fit_transform(X, y, **fit_params)
    578     else:
    579         res = transformer.fit(X, y, **fit_params).transform(X)

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    301         Xt, fit_params = self._fit(X, y, **fit_params)
    302         if hasattr(last_step, 'fit_transform'):
--> 303             return last_step.fit_transform(Xt, y, **fit_params)
    304         elif last_step is None:
    305             return Xt

TypeError: fit_transform() takes 2 positional arguments but 3 were given

2 个答案:

答案 0 :(得分:0)

是的,那是因为LabelEncoder只需要一个数组y,而FeatureUnion会尝试向它发送X和y。

请参阅:https://github.com/scikit-learn/scikit-learn/issues/3956

您可以使用简单的解决方法:

定义一个自定义labelEncoder,如下所示:

class MyLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.le = LabelEncoder()

    def fit(self, x, y=None):
        return self.le.fit(x)

    def transform(self, x, y=None):
        return self.le.transform(x).reshape(-1,1)

    def fit_transform(self, x, y=None):
        self.fit(x)
        return self.transform(x)

在管道中,这样做:

....
....
                ('selector', ItemSelector(key='u_category')),
                ('labelenc', MyLabelEncoder()),

请注意trasform()方法中的重塑(-1,1)。这是因为FeatureUnion仅适用于二维数据。 FeatureUnion中的所有单个变换器应仅返回2-d数据。

答案 1 :(得分:-1)

你可能需要在这样的功能数组中添加它们,请尝试在这样的功能中添加两个选择器并向我显示结果

features = np.recarray(shape=(len(posts),),
                               dtype=[('u_category', object), ('rawtext', object)])