使用FeatureUnion的SKLearn管道工作很奇怪

时间:2017-09-07 21:00:39

标签: python pandas scikit-learn

我有一个数据集,在一列中我有原始文本数据,在其他类别中存储为文本。有点像这样:

u_category | rawtext
cat1         This is a category
cat2         This is category 2
cat1         This again belongs to category 1

我正在构建一个管道,它可以:1)通过LabelEncoder将类别转换为整数;2)使用tf-idf向量化器对rawtext中的文本数据进行转换;3)通过FeatureUnion将两者组合起来;最后4)用LinearSVC学习对它们进行分类。

当我拟合数据时,这很好用。但是,当我尝试在测试数据集上获得分数时,却收到一个错误,提示y变量中引入了新的值,即使我检查过y的值,其中并不包含任何新值。错误信息中列出的值似乎来自测试集的u_category列,我无法弄清楚为什么它会把事情搞混。总而言之,我为拟合和测试提供了相同格式的数据,但是在测试时,代码似乎试图使用X中的数据而不是y作为目标类。

包含辅助函数和所有内容的完整代码:

rawtext

(它主要基于这个教程,辅助函数解决了这里所描述的一些问题。)

到目前为止,一切都很好。然后我尝试做亵渎:

class MyLabelEncoder(BaseEstimator, TransformerMixin):
    """Adapter that lets a ``LabelEncoder`` be used as a feature transformer.

    The wrapped encoder's methods take a single array argument, whereas
    the methods here accept the ``(x, y=None)`` calling convention, so the
    adapter can be placed as a step inside a ``Pipeline``.

    The encoded values are returned as a single column (shape ``(n, 1)``)
    rather than a flat vector.
    """

    def __init__(self):
        self.le = preprocessing.LabelEncoder()

    def fit(self, x, y=None):
        """Learn the set of distinct labels present in ``x``.

        ``y`` is accepted for API compatibility and ignored.

        Returns:
            This transformer itself (not the wrapped encoder), so that
            chained calls such as ``enc.fit(x).transform(x)`` go through
            this class's ``transform`` and keep the column reshape.
        """
        self.le.fit(x)
        return self

    def transform(self, x, y=None):
        """Encode ``x`` as integer codes, returned as an ``(n, 1)`` column.

        ``y`` is accepted for API compatibility and ignored.

        Raises:
            ValueError: propagated from the wrapped encoder when ``x``
                contains a label that was not present during ``fit``
                ("y contains new labels: ...").
        """
        return self.le.transform(x).reshape(-1, 1)

    def fit_transform(self, x, y=None):
        """Fit on ``x`` and return its encoding as an ``(n, 1)`` column."""
        return self.fit(x).transform(x)

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one entry from its input.

    ``key`` is whatever index the input container understands; the
    transformer simply returns ``data_dict[key]``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No-op: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

# Replace missing u_category values with the empty string.
df_resolved["u_category"] = df_resolved["u_category"].fillna("")

# Branch 1: select the 'u_category' column and turn each category into an
# integer code (one output column).
category_branch = Pipeline(steps=[
    ('selector', ItemSelector(key='u_category')),
    ('labelenc', MyLabelEncoder()),
])

# Branch 2: select the 'rawtext' column and build tf-idf features from it.
text_branch = Pipeline(steps=[
    ('selector', ItemSelector(key='rawtext')),
    ('tfidf', TfidfVectorizer(
        max_df=0.5,
        min_df=1,
        stop_words='english',
        token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b',
    )),
])

# Combine the outputs of both branches, each with weight 1.0.
# NOTE: the category branch is registered under the name 'rawtext' even
# though it reads the 'u_category' column; the names are kept as they are.
combined_features = FeatureUnion(
    transformer_list=[
        ('rawtext', category_branch),
        ('features', text_branch),
    ],
    transformer_weights={'rawtext': 1.0, 'features': 1.0},
)

# Full model: combined features followed by a linear SVM classifier.
pipeline = Pipeline(steps=[
    ('union', combined_features),
    ('linear_svc', LinearSVC(penalty="l2")),
])

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
feature_columns = ['u_category', 'caller_id.country', 'rawtext']
X_train, X_test, y_train, y_test = train_test_split(
    df_resolved[feature_columns],
    df_resolved['assignment_group.name'],
    test_size=0.2,
    random_state=42,
)

# Fit every transformer and the final classifier on the training split.
pipeline.fit(X_train, y_train)

然后一切都出错了。完整堆栈跟踪:

# Score the fitted pipeline on the held-out split and show the result.
test_score = pipeline.score(X_test, y_test)
print(test_score)

这看起来非常像来自 --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-80-c3e529f2bc17> in <module>() ----> 1 print(pipeline.score(X_test, y_test)) /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs) 52 53 # lambda, but not partial, allows help() to work with update_wrapper ---> 54 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs) 55 # update the docstring of the returned function 56 update_wrapper(out, self.fn) /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in score(self, X, y) 502 for name, transform in self.steps[:-1]: 503 if transform is not None: --> 504 Xt = transform.transform(Xt) 505 return self.steps[-1][-1].score(Xt, y) 506 /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in transform(self, X) 761 Xs = Parallel(n_jobs=self.n_jobs)( 762 delayed(_transform_one)(trans, name, weight, X) --> 763 for name, trans, weight in self._iter()) 764 if not Xs: 765 # All transformers are None /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable) 756 # was dispatched. In particular this covers the edge 757 # case of Parallel used with an exhausted iterator. 
--> 758 while self.dispatch_one_batch(iterator): 759 self._iterating = True 760 else: /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator) 606 return False 607 else: --> 608 self._dispatch(tasks) 609 return True 610 /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch) 569 dispatch_timestamp = time.time() 570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) --> 571 job = self._backend.apply_async(batch, callback=cb) 572 self._jobs.append(job) 573 /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback) 107 def apply_async(self, func, callback=None): 108 """Schedule a func to be run""" --> 109 result = ImmediateResult(func) 110 if callback: 111 callback(result) /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch) 324 # Don't delay the application, to avoid keeping the input 325 # arguments in memory --> 326 self.results = batch() 327 328 def get(self): /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, name, weight, X) 565 566 def _transform_one(transformer, name, weight, X): --> 567 res = 
transformer.transform(X) 568 # if we have a weight for this transformer, multiply output 569 if weight is None: /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X) 443 for name, transform in self.steps: 444 if transform is not None: --> 445 Xt = transform.transform(Xt) 446 return Xt 447 <ipython-input-32-297ed049a40a> in transform(self, x, y) 7 8 def transform(self, x, y=None): ----> 9 return self.le.transform(x).reshape(-1,1) 10 11 def fit_transform(self, x, y=None): /Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in transform(self, y) 151 if len(np.intersect1d(classes, self.classes_)) < len(classes): 152 diff = np.setdiff1d(classes, self.classes_) --> 153 raise ValueError("y contains new labels: %s" % str(diff)) 154 return np.searchsorted(self.classes_, y) 155 ValueError: y contains new labels: ['Account Access' 'Alert Generation' 'Appworx' 'Capital EMEA (WCS BI Interfaces)' 'Client End - Calendar' 'DB Connection Issues' ... ... 'Transferred Case' 'Unavailable/Error' 'User Admin' 'Windows 7' 'production scheduling'] rawtext列的数据,它是一个pandas DataFrame,而不是X_test中的类,其中所有值都以@开头登录。

我搞砸了什么?如何在此设置中告知LinearSVC模型的分数?谢谢!

0 个答案:

没有答案