Most of the time I write code like the following:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from my_stuff import MyOwnEstimator

minmax_scaler = preprocessing.MinMaxScaler()
my_own_estimator = MyOwnEstimator()
lr = linear_model.LogisticRegression()
pipeline = Pipeline([
    ('minmax_scaler', minmax_scaler),
    ('my_own_estimator', my_own_estimator),
    ('lr', lr)
])
...
where my_own_estimator is a subclass of BaseEstimator and TransformerMixin that follows the scikit-learn conventions.
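The real MyOwnEstimator is not shown here; for context, a minimal sketch of a transformer following that convention (fit returns self, transform returns the new feature matrix) would look roughly like this, with the actual logic omitted:

from sklearn.base import BaseEstimator, TransformerMixin

class MyOwnEstimator(BaseEstimator, TransformerMixin):
    """Hypothetical skeleton; the real transformer does something useful in transform()."""

    def fit(self, X, y=None):
        # learn any state needed from the training data
        return self

    def transform(self, X):
        # placeholder: the real class returns a transformed feature matrix
        return X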
What would be the recommended way to wrap this code up so that it stays compatible with the rest of the scikit-learn world (I mean, so it can still be used for prediction and so on)?
I know I can persist the pipeline, but I am more interested in putting that code inside a subclass (or something else).
Should I inherit from LogisticRegression or from Pipeline? Or maybe take another approach? A sketch of the Pipeline-inheritance option follows.
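For reference, the Pipeline-inheritance option I have in mind would just be a thin subclass that hard-codes the steps (only a sketch, reusing the same step names as above):

from sklearn import preprocessing, linear_model
from sklearn.pipeline import Pipeline
from my_stuff import MyOwnEstimator

class MyPipeline(Pipeline):
    def __init__(self, steps=None):
        # keep the steps argument so that sklearn's clone() can reconstruct the object
        if steps is None:
            steps = [
                ('minmax_scaler', preprocessing.MinMaxScaler()),
                ('my_own_estimator', MyOwnEstimator()),
                ('lr', linear_model.LogisticRegression())
            ]
        super(MyPipeline, self).__init__(steps)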
Edit: my idea so far is:
# coding: utf-8
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from my_stuff import MyOwnEstimator
class MyLogisticRegression(BaseEstimator, ClassifierMixin):
    """
    A drop-in replacement for scikit-learn's LogisticRegression.
    """

    def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.minmax_scaler = MinMaxScaler()
        self.my_own_estimator = MyOwnEstimator()
        self.lr = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                                     class_weight=class_weight, random_state=random_state,
                                     solver=solver, max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)
        self.pipeline = Pipeline([
            ('minmax_scaler', self.minmax_scaler),
            ('my_own_estimator', self.my_own_estimator),
            ('lr', self.lr)
        ])

    def fit(self, X, y=None):
        self.pipeline.fit(X, y)
        # expose the fitted attributes of the inner steps on the wrapper itself
        self.min_ = self.pipeline.named_steps['minmax_scaler'].min_
        self.scale_ = self.pipeline.named_steps['minmax_scaler'].scale_
        self.data_min_ = self.pipeline.named_steps['minmax_scaler'].data_min_
        self.data_max_ = self.pipeline.named_steps['minmax_scaler'].data_max_
        self.data_range_ = self.pipeline.named_steps['minmax_scaler'].data_range_
        self.coef_ = self.pipeline.named_steps['lr'].coef_
        self.intercept_ = self.pipeline.named_steps['lr'].intercept_
        self.classes_ = self.pipeline.named_steps['lr'].classes_
        return self

    def predict_proba(self, X):
        return self.pipeline.predict_proba(X)

    def predict_log_proba(self, X):
        return self.pipeline.predict_log_proba(X)

    def predict(self, X):
        return self.pipeline.predict(X)

    def score(self, X, y):
        return self.pipeline.score(X, y)
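To check that the wrapper stays usable from the rest of scikit-learn, I would exercise it roughly like this (made-up data, just a smoke test, assuming MyOwnEstimator behaves like a regular transformer):

import numpy as np
from sklearn.model_selection import cross_val_score

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

clf = MyLogisticRegression(C=0.5)
clf.fit(X, y)
print(clf.predict(X[:3]))
print(clf.predict_proba(X[:3]))

# cross_val_score clones the estimator, so get_params()/__init__ must round-trip
print(cross_val_score(clf, X, y, cv=3))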
Does it make sense? Is this a better approach?