I have a dataset where one column holds raw text data and another column holds the category, stored as text. Something like this:
u_category | rawtext
cat1       | This is a category
cat2       | This is category 2
cat1       | This again belongs to category 1
I am trying to build a pipeline that 1) converts the categories to integers with a LabelEncoder, 2) transforms the raw text with a tf-idf vectorizer, 3) combines the two feature sets with FeatureUnion, and finally 4) learns to classify them with LinearSVC.
This works fine when I fit the data. However, when I try to get a score on the test dataset, I get an error saying that new labels have been introduced into y, even though I have checked the y values and the y variable contains no new values. The values listed in the error seem to come from a column of the test set instead, and I cannot figure out why it mixes these up. To sum up: I feed the same format of data for fitting and for scoring, yet at scoring time the code appears to be using data from X, rather than y, as the target classes.
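To illustrate what I mean by the error, here is a tiny standalone snippet (toy values, unrelated to my real data, and only my guess at the mechanism) that produces the same kind of message when transform() is given a value that fit() never saw:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(["cat1", "cat2"])           # encoder only knows these two values
le.transform(["cat1", "cat3"])     # "cat3" was never seen during fit
# ValueError: y contains new labels: ['cat3']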
Full code, with the helper functions and everything: (it is mostly based on this tutorial, and the helper functions address some of the issues described here.)
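In case it matters, the snippets below rely on imports along these lines (the full code has them in roughly this form, exact layout may differ):

from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split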
So far, so good. Then I try to commit the following blasphemy:
class MyLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.le = preprocessing.LabelEncoder()

    def fit(self, x, y=None):
        return self.le.fit(x)

    def transform(self, x, y=None):
        return self.le.transform(x).reshape(-1,1)

    def fit_transform(self, x, y=None):
        self.fit(x)
        return self.transform(x)

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
df_resolved.u_category = df_resolved.u_category.fillna("")
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[
            # Pipeline for pulling features from the post's subject line
            ('rawtext', Pipeline([
                ('selector', ItemSelector(key='u_category')),
                ('labelenc', MyLabelEncoder()),
            ])),
            # Pipeline for standard bag-of-words model for body
            ('features', Pipeline([
                ('selector', ItemSelector(key='rawtext')),
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=1,
                                          stop_words='english',
                                          token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
            ])),
        ],
        # weight components in FeatureUnion
        transformer_weights={
            'rawtext': 1.0,
            'features': 1.0,
        },
    )),
    # Use a SVC classifier on the combined features
    ('linear_svc', LinearSVC(penalty="l2")),
])
X_train, X_test, y_train, y_test = train_test_split(df_resolved[['u_category','caller_id.country','rawtext']], df_resolved['assignment_group.name'], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
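For what it's worth, this is roughly how I checked that the test labels contain nothing the training labels have not seen (an approximation of the actual check I ran):

print(set(y_test) - set(y_train))   # prints set() for me, so no unseen target classes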
Then everything goes wrong when I try to score:

print(pipeline.score(X_test, y_test))

Full stack trace:

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-80-c3e529f2bc17> in <module>()
----> 1 print(pipeline.score(X_test, y_test))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
52
53 # lambda, but not partial, allows help() to work with update_wrapper
---> 54 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
55 # update the docstring of the returned function
56 update_wrapper(out, self.fn)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in score(self, X, y)
502 for name, transform in self.steps[:-1]:
503 if transform is not None:
--> 504 Xt = transform.transform(Xt)
505 return self.steps[-1][-1].score(Xt, y)
506
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in transform(self, X)
761 Xs = Parallel(n_jobs=self.n_jobs)(
762 delayed(_transform_one)(trans, name, weight, X)
--> 763 for name, trans, weight in self._iter())
764 if not Xs:
765 # All transformers are None
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, name, weight, X)
565
566 def _transform_one(transformer, name, weight, X):
--> 567 res = transformer.transform(X)
568 # if we have a weight for this transformer, multiply output
569 if weight is None:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
443 for name, transform in self.steps:
444 if transform is not None:
--> 445 Xt = transform.transform(Xt)
446 return Xt
447
<ipython-input-32-297ed049a40a> in transform(self, x, y)
7
8 def transform(self, x, y=None):
----> 9 return self.le.transform(x).reshape(-1,1)
10
11 def fit_transform(self, x, y=None):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in transform(self, y)
151 if len(np.intersect1d(classes, self.classes_)) < len(classes):
152 diff = np.setdiff1d(classes, self.classes_)
--> 153 raise ValueError("y contains new labels: %s" % str(diff))
154 return np.searchsorted(self.classes_, y)
155
ValueError: y contains new labels: ['Account Access' 'Alert Generation' 'Appworx'
'Capital EMEA (WCS BI Interfaces)' 'Client End - Calendar'
'DB Connection Issues' ... ... 'Transferred Case' 'Unavailable/Error' 'User Admin' 'Windows 7' 'production scheduling']
This looks very much like data from the rawtext column of X_test, which is a pandas DataFrame, and not like the classes in y, which all start with an @ sign.
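For reference, the target classes really do all start with an @ sign; a quick check along these lines returns True for both splits:

print(y_train.str.startswith('@').all())
print(y_test.str.startswith('@').all())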
What am I messing up? How can I get the score of the LinearSVC model in this setup? Thanks!