I am trying to tune hyperparameters in a scikit-learn Pipeline that uses some custom transformers, but I keep getting an error:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
class RollingMeanTransform(BaseEstimator, TransformerMixin):
    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df
class TimeEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, col, drop_original=True):
        self._col = col
        self._drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X
huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]
huber_grid = {'clf__alpha': huber_alpha,
              'clf__epsilon': huber_epsilon,
              'clf__max_iter': huber_max_iter,
              }

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')),
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)
                                ])
I tried this:
grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))
grid.fit(X_train, y_train)
but I get the following traceback:
ValueError Traceback (most recent call last)
<ipython-input-14-3949096c802a> in <module>()
----> 1 grid.fit(X_train, y_train)
~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
456 estimator.fit(X_train, **fit_params)
457 else:
--> 458 estimator.fit(X_train, y_train, **fit_params)
459
460 except Exception as e:
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
246 This estimator
247 """
--> 248 Xt, fit_params = self._fit(X, y, **fit_params)
249 if self._final_estimator is not None:
250 self._final_estimator.fit(Xt, y, **fit_params)
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
211 Xt, fitted_transformer = fit_transform_one_cached(
212 cloned_transformer, None, Xt, y,
--> 213 **fit_params_steps[name])
214 # Replace the transformer of the step with the fitted
215 # transformer. This is necessary when loading the transformer
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
360
361 def __call__(self, *args, **kwargs):
--> 362 return self.func(*args, **kwargs)
363
364 def call_and_shelve(self, *args, **kwargs):
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
518 else:
519 # fit method of arity 2 (supervised transformation)
--> 520 return self.fit(X, y, **fit_params).transform(X)
521
522
~/my_project/my_model.py in transform(self, X)
126 def transform(self, X):
127 X = X.copy()
--> 128 unique_vals = float(len(X[self._col].unique()))
129 X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
130 X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3850 loc = indexer.item()
3851 else:
-> 3852 raise ValueError("cannot label index with a null key")
3853
3854 return self.iget(loc, fastpath=fastpath)
ValueError: cannot label index with a null key
I don't know what is going on or how to fix it. If I remove the transformers the pipeline works, but I need them.
If I change the pipeline to
regression_pipeline = Pipeline([('mean', RollingMeanTransform('my_other_col')),
                                ('encoding', TimeEncoding('my_col')),
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)
                                ])
I get the same error, but this time it is raised in the mean transformer.
Full code example:
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
class RollingMeanTransform(BaseEstimator, TransformerMixin):
    def __init__(self, col, window=3):
        self._window = window
        self._col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['{}_rolling_mean'.format(self._col)] = df[self._col].shift(1).rolling(self._window).mean().fillna(0.0)
        return df


class TimeEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, col, drop_original=True):
        self._col = col
        self._drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        unique_vals = float(len(X[self._col].unique()))
        X['sin_{}'.format(self._col)] = np.sin(2 * np.pi * X[self._col] / unique_vals)
        X['cos_{}'.format(self._col)] = np.cos(2 * np.pi * X[self._col] / unique_vals)
        if self._drop_original:
            X.drop([self._col], axis=1, inplace=True, errors='ignore')
        return X


class Treshold(BaseEstimator, TransformerMixin):
    # note: threshold which removes features with a constant value
    # and preserves the input data as a data frame
    def __init__(self):
        self.to_keep = list()

    def fit(self, X, y=None):
        self.to_keep = list()
        self.colname_original = X.columns
        for i, col in enumerate(X):
            if len(np.unique(X.values[:, i])) >= 2:
                self.to_keep.append(col)
        return self

    def transform(self, X, copy=None):
        return X[self.to_keep]


class Scale(BaseEstimator, TransformerMixin):
    # note: scaler which keeps the input data as a data frame
    # and does not scale binary features
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.bin_vars_index = list()
        self.cont_vars_index = list()
        self.colnames_original = list()

    def fit(self, X, y=None):
        self.bin_vars_index = list()
        self.cont_vars_index = list()
        self.colnames_original = X.columns
        for i in range(X.shape[1]):
            if len(np.unique(X.values[:, i])) <= 2:
                self.bin_vars_index.append(i)
            else:
                self.cont_vars_index.append(i)
        self.scaler.fit(X.values[:, self.cont_vars_index])
        return self

    def transform(self, X, copy=None):
        X_tail = self.scaler.transform(X.values[:, self.cont_vars_index], copy)
        res = np.concatenate((X.values[:, self.bin_vars_index], X_tail), axis=1)
        colnames_res = np.array(
            list(self.colnames_original[self.bin_vars_index]) + list(self.colnames_original[self.cont_vars_index]))
        assert len(colnames_res) == len(self.colnames_original)
        res = pd.DataFrame(data=res, columns=colnames_res)
        return res[[str(el) for el in self.colnames_original]].set_index(X.index)
huber = HuberRegressor()
huber_max_iter = [100, 200, 500, 1000]
huber_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100]
huber_epsilon = [1.15, 1.25, 1.35, 1.5]
huber_grid = {'clf__alpha': huber_alpha,
              'clf__epsilon': huber_epsilon,
              'clf__max_iter': huber_max_iter,
              }

regression_pipeline = Pipeline([('encoding', TimeEncoding('my_col')),
                                ('mean', RollingMeanTransform('my_other_col')),
                                ('select', Treshold()),
                                ('scale', Scale()),
                                ('clf', huber)
                                ])
grid = GridSearchCV(regression_pipeline, huber_grid, cv=TimeSeriesSplit(n_splits=5))
X = pd.DataFrame(np.random.randint(low=0, high=10, size=(20, 2)), columns=['my_col', 'my_other_col'])
y = pd.Series(np.random.randint(low=0, high=10, size=(20,)))
grid.fit(X, y)
Answer (score: 4)
You see, GridSearchCV (and most cross-validation utilities in scikit-learn) clones the supplied estimator to perform the grid search. To do that, it uses the get_params() and set_params() methods of the BaseEstimator class you inherit from. Now, get_params() collects the parameter names from the __init__() method you declared:
init_signature = signature(init)
# Consider the constructor parameters excluding 'self'
parameters = [p for p in init_signature.parameters.values()
              if p.name != 'self' and p.kind != p.VAR_KEYWORD]
...
...
Now, to get the values of those parameters, [this code is used](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/base.py#L228):
for key in self._get_param_names():
    value = getattr(self, key, None)
So the parameters it will pick up are:
col = None
drop_original = None
not the underscore-prefixed ones you actually used. And both values are None, because your object has no attributes with those names.
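You can check this directly; with the scikit-learn version from the traceback (where the quoted getattr() call falls back to None), a quick check looks like this:

enc = TimeEncoding('my_col')
print(enc.get_params())
# {'col': None, 'drop_original': None}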
Now, these None-valued parameters are used to instantiate the cloned object in clone():
...
new_object = klass(**new_object_params)
...
...
These None values are then assigned to _col and _drop_original by __init__(), so the cloned transformer ends up evaluating X[None] inside transform(). That is the source of the "cannot label index with a null key" error.
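A quick way to see the broken clone for yourself (again assuming the scikit-learn version from the traceback):

from sklearn.base import clone

enc_clone = clone(TimeEncoding('my_col'))
print(enc_clone._col)  # None -- transform() would then evaluate X[None] and fail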
This behaviour is documented in the scikit-learn developer guidelines:
The arguments accepted by __init__ should all be keyword arguments with a default value. In other words, a user should be able to instantiate an estimator without passing any arguments to it. The arguments should all correspond to hyperparameters describing the model or the optimisation problem the estimator tries to solve.

In addition, every keyword argument accepted by __init__ should correspond to an attribute on the instance. Scikit-learn relies on this to find the relevant attributes to set on an estimator when doing model selection.
So the suggested fix is to remove the leading underscores from your parameter names (so that the attribute names on self are identical to the parameter names in __init__):
class TimeEncoding(BaseEstimator, TransformerMixin):
    # Changed the names from _col to col
    def __init__(self, col, drop_original=True):
        self.col = col
        self.drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Updated the names to be used
        unique_vals = float(len(X[self.col].unique()))
        X['sin_{}'.format(self.col)] = np.sin(2 * np.pi * X[self.col] / unique_vals)
        X['cos_{}'.format(self.col)] = np.cos(2 * np.pi * X[self.col] / unique_vals)
        if self.drop_original:
            X.drop([self.col], axis=1, inplace=True, errors='ignore')
        return X
Now do this for all of your custom estimators.
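For instance, applying the same rename to RollingMeanTransform from the question gives:

class RollingMeanTransform(BaseEstimator, TransformerMixin):
    # Same fix: attribute names now match the __init__() parameter names
    def __init__(self, col, window=3):
        self.col = col
        self.window = window

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['{}_rolling_mean'.format(self.col)] = df[self.col].shift(1).rolling(self.window).mean().fillna(0.0)
        return df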
Now, if you have some constraint that requires the leading underscores on the attributes (maybe to mark them as private or similar), the second option is to override the set_params() method to set the parameters explicitly.
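Here is a minimal sketch of that second option, keeping the underscore attributes; note that get_params() also has to be overridden, otherwise it would still report None for both values:

class TimeEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, col, drop_original=True):
        self._col = col
        self._drop_original = drop_original

    def get_params(self, deep=True):
        # Report the constructor parameters from the underscore attributes
        return {'col': self._col, 'drop_original': self._drop_original}

    def set_params(self, **params):
        # Map the parameter names back onto the underscore attributes
        if 'col' in params:
            self._col = params['col']
        if 'drop_original' in params:
            self._drop_original = params['drop_original']
        return self

    # fit() and transform() stay exactly as in the question

With this, clone() reconstructs the object as TimeEncoding(col='my_col', drop_original=True) instead of passing None. The rename in the first option remains the simpler and more idiomatic solution, though.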