我为此大型代码块提前道歉。这是我提供可重复工作示例的最简洁方法。
在代码中,我尝试使用 FeatureUnion 转换数据框中的两列:其中一列是文本数据,因此使用 TfidfVectorizer;另一列是标签列表,因此我想使用 MultiLabelBinarizer。
ItemSelector 转换器用于从数据框中选择正确的列。
为什么我会得到 TypeError: fit_transform() takes 2 positional arguments but 3 were given?
我需要如何更改代码才能使此示例正常运行?
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import pandas as pd
import numpy as np
# Toy dataset for the example: every column is a single five-row pattern
# repeated five times, which gives 25 rows in total.
d = {
    # Target class of each row.
    'label': ['Help', 'Help', 'Other', 'Sale/Coupon', 'Other'] * 5,
    # Tag lists, stored as the *string* form of a Python list.
    'multilabels': [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ] * 5,
    # Free-text column.
    'response': [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ] * 5,
}
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one entry of its input by key.

    Used as the first step of a pipeline so that the following step only
    receives ``data[key]`` (for example one column of a DataFrame).
    """

    def __init__(self, key):
        # Label used to index the input in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, df):
        """Return the entry of ``df`` stored under ``self.key``."""
        selected = df[self.key]
        return selected
# Reproduction script for the TypeError shown in the traceback that follows.
# The code is intentionally left exactly as posted.
#
# Two feature branches are built from the DataFrame and concatenated by
# FeatureUnion.
feature_union = FeatureUnion(
transformer_list=[
# Text branch: pick the 'response' column and vectorize it with TF-IDF.
('step1', Pipeline([
('selector', ItemSelector(key='response')),
('tfidf', TfidfVectorizer()),
])),
# Tag branch: pick the 'multilabels' column and binarize it.
# NOTE: this branch is where the error originates. Pipeline.fit_transform
# calls last_step.fit_transform(Xt, y, ...) (see the traceback), and the last
# step here is MultiLabelBinarizer, whose fit_transform does not accept the
# extra `y` argument -> "takes 2 positional arguments but 3 were given".
('step2', Pipeline([
('selector', ItemSelector(key='multilabels')),
('multilabel', MultiLabelBinarizer())
]))
])
# Full model: features -> SGD classifier, wrapped in one-vs-rest.
pipeline = OneVsRestClassifier(
Pipeline([('union', feature_union),('sgd', SGDClassifier())])
)
# Empty parameter grid: the search only cross-validates the single default
# configuration.
grid = GridSearchCV(pipeline, {}, verbose=5)
df = pd.DataFrame(d, columns=['response', 'multilabels', 'label'])
# NOTE(review): the 'multilabels' values are still strings such as "['Deck']"
# at this point, not lists of tags.
X = df[['response', 'multilabels']]
y = df['label']
# Raises the TypeError described above.
grid.fit(X, y)
这是完整的错误:
Traceback (most recent call last):
File "C:/Users/owner/Documents/my files/Account Tracking/Client/Foresee Analysis/SOQuestion.py", line 72, in <module>
grid.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 564, in _fit
for parameters in parameter_iterable
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\model_selection\_validation.py", line 238, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 216, in fit
for i, column in enumerate(columns))
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 80, in _fit_binary
estimator.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 268, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 234, in _fit
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 734, in fit_transform
for name, trans, weight in self._iter())
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 577, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 303, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
注意:我查看了“fit_transform() takes 2 positional arguments but 3 were given”这个问题,但它对我来说仍然没有意义。
答案 0 :(得分:0)
解决了。编写另一个转换器来处理多标签二值化。这与其说是解决方案,不如说是一种变通方法,因为二值化是在转换器的 transform 方法中完成的,而不是在管道中完成的。
import ast

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
# Toy dataset for the example: every column is a single five-row pattern
# repeated five times, which gives 25 rows in total.
d = {
    # Target class of each row.
    'label': ['Help', 'Help', 'Other', 'Sale/Coupon', 'Other'] * 5,
    # Tag lists, stored as the *string* form of a Python list.
    'multilabels': [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ] * 5,
    # Free-text column.
    'response': [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ] * 5,
}
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column of its input.

    Placed first in a pipeline so that the next step only receives
    ``X[column]``.
    """

    def __init__(self, column):
        # Label used to index the input in ``transform``.
        self.column = column

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X[self.column]``; ``y`` and ``fit_params`` are ignored."""
        picked = X[self.column]
        return picked
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
    """Binarize one column of label collections with ``MultiLabelBinarizer``.

    ``MultiLabelBinarizer`` cannot be dropped into a ``Pipeline`` directly:
    the pipeline calls ``fit_transform(X, y)`` while the binarizer's
    ``fit_transform`` accepts a single data argument. This wrapper exposes
    the usual ``fit(X, y)`` / ``transform(X)`` transformer interface around it.

    The label vocabulary is learned once in ``fit`` and reused by
    ``transform``, so the output columns are the same for training data and
    for any data transformed later.

    Parameters
    ----------
    column : label
        Key of the column in ``X`` holding, for each row, an iterable of
        labels.

    Attributes
    ----------
    mlb_ : MultiLabelBinarizer
        The binarizer fitted on ``X[column]`` during ``fit``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn the set of labels present in ``X[self.column]``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Fit here, not in transform(): re-fitting on the data passed to
        # transform() would re-derive the vocabulary from that data and could
        # yield a different number or order of columns than the one the
        # downstream estimator was trained on.
        self.mlb_ = MultiLabelBinarizer().fit(X[self.column])
        return self

    def transform(self, X):
        """Return the binary indicator matrix for ``X[self.column]``.

        Raises ``NotFittedError`` (via ``check_is_fitted``) when called
        before ``fit``.
        """
        # Local import keeps this class self-contained with respect to the
        # file's import block.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'mlb_')
        return self.mlb_.transform(X[self.column])
# Full model: two feature branches concatenated by FeatureUnion, then an SGD
# classifier, all wrapped in a one-vs-rest meta-classifier.
pipeline = OneVsRestClassifier(
    Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                # Text branch: select 'response' and vectorize it with TF-IDF.
                ('step1', Pipeline([
                    ('selector', ItemSelector(column='response')),
                    ('tfidf', TfidfVectorizer())
                ])),
                # Tag branch: select and binarize 'multilabels' in one custom
                # step (MultiLabelBinarizer itself cannot be a pipeline step).
                ('step2', Pipeline([
                    ('selector', MultiLabelTransformer(column='multilabels'))
                ]))
            ])),
        ('sgd', SGDClassifier())
    ])
)

# Empty parameter grid: the search only cross-validates the single default
# configuration.
grid = GridSearchCV(pipeline, {}, verbose=5)

df = pd.DataFrame(d, columns=['response', 'multilabels', 'label'])
# The tag lists are stored as strings such as "['Deck', 'Stain']"; turn them
# into real lists. ast.literal_eval only parses Python literals, unlike
# eval(), which would execute arbitrary expressions found in the data.
df['multilabels'] = df['multilabels'].apply(ast.literal_eval)

X = df[['response', 'multilabels']]
y = df['label']
grid.fit(X, y)