我尝试使用分类变量构建管道
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model
from sklearn.pipeline import Pipeline
df = pd.DataFrame({'a':range(6), 'c':['a', 'b', 'c']*2, 'd': ['m', 'f']*3 })
X = df[['c', 'd']]
y = df['a']
regressor = linear_model.SGDRegressor()
转换分类变量
class Cat(TransformerMixin):
def transform(self, X, **transform_params):
enc = DictVectorizer(sparse = False)
enc_data = enc.fit_transform(X.T.to_dict().values())
return enc_data
def fit(self, X, y=None, **fit_params):
return self
管道
pipeline = Pipeline([
('categorical', Cat()),
('model_fitting', regressor),
])
pipeline.fit(X, y)
那是对的。但是当我尝试拟合新数据集时,我会收到错误。例如
contr = pd.DataFrame({'c':['a'], 'd': ['m']})
pred = pipeline.predict(contr)
pred
and
ValueError: shapes (1,2) and (5,) not aligned: 2 (dim 1) != 5 (dim 0)
我在类Cat(TransformerMixin)中遇到问题。如何改进?
答案 0 :(得分:0)
我已完成如下操作。它的工作原理
我改变了功能
class Cat(TransformerMixin):
def transform(self, X, y=None, **fit_params):
enc = DictVectorizer(sparse = False)
encc = enc.fit(df[['c', 'd']].T.to_dict().values())
enc_data = encc.transform(X.T.to_dict().values())
return enc_data
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
新数据集
control = pd.DataFrame({'c':['b'], 'd': ['f']})
pred = pipeline.predict(control)
pred