我尝试修改this示例以使用Pandas数据帧而不是测试数据集。我无法这样做,因为ItemSelector
似乎无法识别列名。
请注意数据框df_resolved.columns
的列返回:
Index(['u_category', ... ... 'resolution_time', 'rawtext'],
dtype='object')
所以我显然在我的数据框中有这个。
但是,当我尝试运行解决方案时,我收到错误
" ValueError:没有名称为u_category"
的字段
此外,我似乎无法修改代码以支持在ItemSelector
中选择多个列,因此在此解决方案中,我必须分别应用变换器列。
我的代码是:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
"""Extract the subject & body from a usenet post in a single pass.
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def transform(self, posts):
features = np.recarray(shape=(len(posts),),
dtype=[('subject', object), ('body', object)])
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
bod = strip_newsgroup_footer(bod)
bod = strip_newsgroup_quoting(bod)
features['body'][i] = bod
prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features['subject'][i] = sub
return features
pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion(
transformer_list=[
# Pipeline for pulling features from the post's subject line
('rawtext', Pipeline([
('selector', ItemSelector(key='u_category')),
('labelenc', preprocessing.LabelEncoder()),
])),
# Pipeline for standard bag-of-words model for body
('features', Pipeline([
('selector', ItemSelector(key='rawtext')),
('tfidf', TfidfVectorizer(max_df=0.5, min_df=1,
stop_words='english',
token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')),
])),
],
# weight components in FeatureUnion
transformer_weights={
'rawtext': 1.0,
'features': 1.0,
},
)),
# Use a SVC classifier on the combined features
('linear_svc', LinearSVC(penalty="l2")),
])
# limit the list of categories to make running this example faster.
X_train, X_test, y_train, y_test = train_test_split(df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')], df_resolved['assignment_group.name'], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
print(pipeline.score(X_test, y_test))
如何修改此代码以便与我的数据帧一起正常工作,并且可能支持一次将变换器应用于多个列?
如果我取出ItemSelector
,它似乎有效。所以这有效:
ds = ItemSelector(key='u_category')
ds.fit(df_resolved)
labelenc = preprocessing.LabelEncoder()
labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved))
完全堆积痕迹:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-93-a4ba29c137ec> in <module>()
136
137
--> 138 pipeline.fit(X_train, y_train)
139 #y = pipeline.predict(X_test)
140 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
299 """
300 last_step = self._final_estimator
--> 301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
303 return last_step.fit_transform(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
<ipython-input-93-a4ba29c137ec> in transform(self, data_dict)
55
56 def transform(self, data_dict):
---> 57 return data_dict[self.key]
58
59
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx)
497
498 def __getitem__(self, indx):
--> 499 obj = super(recarray, self).__getitem__(indx)
500
501 # copy behavior of getattr, except that here
ValueError: no field of name u_category
更新
即使我使用数据框(否train_test_split
),问题仍然存在:
更新2:
好的,所以我删除了SubjectBodyExtractor
,因为我不需要那个。现在ValueError: no field of name u_category
已消失,但我遇到了一个新错误:TypeError: fit_transform() takes 2 positional arguments but 3 were given
。
堆栈追踪:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-110-292294015e44> in <module>()
129
130
--> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name'])
132 #y = pipeline.predict(X_test)
133 #print(classification_report(y, test.target))
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
732 delayed(_fit_transform_one)(trans, name, weight, X, y,
733 **fit_params)
--> 734 for name, trans, weight in self._iter())
735
736 if not result:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
575 **fit_params):
576 if hasattr(transformer, 'fit_transform'):
--> 577 res = transformer.fit_transform(X, y, **fit_params)
578 else:
579 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
301 Xt, fit_params = self._fit(X, y, **fit_params)
302 if hasattr(last_step, 'fit_transform'):
--> 303 return last_step.fit_transform(Xt, y, **fit_params)
304 elif last_step is None:
305 return Xt
TypeError: fit_transform() takes 2 positional arguments but 3 were given
答案 0 :(得分:0)
是的,那是因为LabelEncoder只需要一个数组y,而FeatureUnion会尝试向它发送X和y。
请参阅:https://github.com/scikit-learn/scikit-learn/issues/3956
您可以使用简单的解决方法:
定义一个自定义labelEncoder,如下所示:
class MyLabelEncoder(BaseEstimator, TransformerMixin):
def __init__(self):
self.le = LabelEncoder()
def fit(self, x, y=None):
return self.le.fit(x)
def transform(self, x, y=None):
return self.le.transform(x).reshape(-1,1)
def fit_transform(self, x, y=None):
self.fit(x)
return self.transform(x)
在管道中,这样做:
....
....
('selector', ItemSelector(key='u_category')),
('labelenc', MyLabelEncoder()),
请注意trasform()
方法中的重塑(-1,1)。这是因为FeatureUnion仅适用于二维数据。 FeatureUnion中的所有单个变换器应仅返回2-d数据。
答案 1 :(得分:-1)
你可能需要在这样的功能数组中添加它们,请尝试在这样的功能中添加两个选择器并向我显示结果
features = np.recarray(shape=(len(posts),),
dtype=[('u_category', object), ('rawtext', object)])