我的做法如下:
import pandas as pd
from sklearn import preprocessing
import sklearn
from sklearn.pipeline import Pipeline
# Toy frame with two string-valued columns, 'c' and 'd', of 12 rows each.
df = pd.DataFrame({'c':['a', 'b', 'c']*4, 'd': ['m', 'f']*6})
# Pipeline whose only step is a bare LabelEncoder.
encoding_pipeline =Pipeline([
('LabelEncoder', preprocessing.LabelEncoder())
])
# This call raises the TypeError shown in the traceback that follows:
# Pipeline.fit_transform forwards (Xt, y) to the last step's fit_transform,
# which is one positional argument more than that method accepts.
encoding_pipeline.fit_transform(df)
完整的回溯信息(traceback)如下:
TypeError Traceback (most recent call last)
<ipython-input-7-0882633ccf59> in <module>()
----> 1 encoding_pipeline.fit_transform(df)
C:\Program Files\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
183 Xt, fit_params = self._pre_transform(X, y, **fit_params)
184 if hasattr(self.steps[-1][-1], 'fit_transform'):
--> 185 return self.steps[-1][-1].fit_transform(Xt, y, **fit_params)
186 else:
187 return self.steps[-1][-1].fit(Xt, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
出了什么问题?看起来我必须在应用管道之前转换数据帧
答案 0 :(得分:0)
下面是一个简单的解决方案。出错的原因是:LabelEncoder 的 fit_transform 只接受一个参数(y),而 Pipeline 会同时传入 X 和 y,因此需要用一个自定义类把它包装起来。
import pandas as pd
from sklearn import preprocessing
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
# Sample data: two categorical columns of 12 rows each, built by repeating
# short label lists.
df = pd.DataFrame(dict(
    c=['a', 'b', 'c'] * 4,
    d=['m', 'f'] * 6,
))
定义如何选择变量
class ItemSelector:
    """Transformer step that extracts one entry from a keyed container.

    The container only needs to support ``container[key]``; no other
    requirement is placed on it by this class.
    """

    def __init__(self, key):
        """Remember which key ``transform`` should look up."""
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; selection has nothing to learn."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``.

        Any lookup error raised by ``data_dict`` (for example ``KeyError``
        from a plain dict) propagates to the caller unchanged.
        """
        return data_dict[self.key]
现在定义用于编码器的 class:
class MyLEncoder():
    """Wrap ``LabelEncoder`` so it accepts the ``(X, y)`` call shape.

    Every method takes ``X`` plus an optional, ignored ``y`` and extra
    keyword arguments, so the wrapper can be called with more arguments
    than ``LabelEncoder``'s own methods accept.

    The label-to-integer mapping is learned in ``fit`` and reused by
    ``transform``, so data transformed later receives the same codes as
    the data the encoder was fitted on.
    """

    # Fitted LabelEncoder; stays None until ``fit`` has been called.
    _encoder = None

    def fit(self, X, y=None, **fit_params):
        """Learn the label mapping from ``X`` and return ``self``."""
        self._encoder = preprocessing.LabelEncoder().fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        """Encode ``X`` with the mapping learned in ``fit``.

        If ``fit`` was never called, a throwaway encoder is fitted on ``X``
        itself; this keeps the earlier fit-on-the-fly behaviour for callers
        that only ever call ``transform``. Once fitted, labels that were
        not seen during ``fit`` are rejected by the underlying
        ``LabelEncoder`` instead of being silently re-numbered.
        """
        if self._encoder is None:
            return preprocessing.LabelEncoder().fit_transform(X)
        return self._encoder.transform(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its encoded values."""
        return self.fit(X, y, **fit_params).transform(X)
然后定义管道:
# Outer pipeline -> feature union -> one named branch ('categorical') that
# first selects column 'c' and then label-encodes it.
encoding_pipeline = Pipeline(steps=[
    (
        'union',
        FeatureUnion([
            (
                'categorical',
                Pipeline(steps=[
                    ('selector', ItemSelector(key='c')),
                    ('LabelEncoder', MyLEncoder()),
                ]),
            ),
        ]),
    ),
])
最后运行:
# Pass the whole frame in; the 'selector' step picks out column 'c'.
X = df
# Fit and encode in a single call; the resulting array is printed below.
encoding_pipeline.fit_transform(X)
array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2], dtype=int64)
如果您需要把它与某个算法(估计器)一起使用,则还需要补充更多细节。