我编写了一个自定义的 Transformer(转换器)对象,它为信号引入时间延迟嵌入。我想把它的超参数纳入 GridSearchCV
的搜索范围。问题是,它需要一个后处理步骤,用来删除因 pandas 的 shift
操作而产生空值的行。据我了解,转换器并不是为处理目标值(y)而设计的,不过我找到了一种能做到这一点的办法。我的问题是:这种做法是否合理,还是有不那么取巧的更好做法?
时间延迟嵌入转换器:
from typing import Union

from pandas import Series, DataFrame
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

try:
    from pandas.tools.merge import concat
except ImportError:
    # `pandas.tools` no longer exists in newer pandas releases; `concat` is
    # exported from the top-level package instead.
    from pandas import concat
class TimeDelayEmbedder(TransformerMixin, BaseEstimator):
    """
    A Transformer to add time-delay embedding to an existing time history.

    For every input column, one output column is produced per delay in the
    inclusive range spanned by ``min_delay`` and ``max_delay``; each output
    column is the input column passed through pandas ``shift`` with that delay.

    The class derives from ``BaseEstimator`` and keeps the constructor
    arguments unmodified under their own names, so ``get_params``,
    ``set_params`` and ``clone`` work. That is what ``GridSearchCV`` relies on
    to tune ``min_delay`` / ``max_delay``.
    """

    def __init__(self, min_delay: int, max_delay: int):
        """
        Create a new TimeDelayEmbedder.

        :param min_delay: The minimum delay to use.
        :param max_delay: The maximum delay to use.
            If the two are supplied in the wrong order they are swapped when
            ``transform`` runs.
        """
        # Stored verbatim: reordering here would make the stored parameters
        # differ from the constructor arguments, which scikit-learn's
        # get_params/clone machinery does not expect.
        self.min_delay = min_delay
        self.max_delay = max_delay

    def fit(self, X=None, y=None):
        """
        No-op fit; the embedding has nothing to learn from the data.

        :param X: Ignored.
        :param y: Ignored.
        :return: This transformer.
        """
        return self

    def transform(self, X: Union[Series, DataFrame], y: Series = None):
        """
        Transform the input data `X`. The returned DataFrame will have null values,
        which will need to be handled separately e.g. with a NullRowDropper.

        Output columns are named ``<column>__t-<k>`` for a positive delay ``k``,
        ``<column>__t`` for a zero delay and ``<column>__t+<k>`` for a negative
        delay of magnitude ``k``.

        :param X: A pandas Series or DataFrame of time histories.
            For DataFrames, each column will have a history added.
            A Series must have a name (it is used to build the column names).
        :param y: The targets. These will be unaffected.
        :rtype: DataFrame
        """
        if isinstance(X, Series):
            # `is not None` rather than truthiness so that names such as 0 are accepted.
            assert X.name is not None, 'Input Series must have a name.'
            X = X.to_frame(name=X.name)

        # Tolerate the bounds being given in either order.
        first_delay = min(self.min_delay, self.max_delay)
        last_delay = max(self.min_delay, self.max_delay)

        series_list = []
        for column in X.columns:
            for time_step in range(first_delay, last_delay + 1):
                data = X[column].shift(time_step)
                # The label is wrapped in a 1-tuple so that tuple-valued column
                # labels are formatted as a whole instead of being unpacked by `%`.
                if time_step < 0:
                    # Use the magnitude: the '+' already carries the direction.
                    name = '%s__t+%i' % (column, -time_step)
                elif time_step == 0:
                    name = '%s__t' % (column,)
                else:
                    name = '%s__t-%i' % (column, time_step)
                series_list.append(Series(data=data, name=name))

        if not series_list:
            # Nothing to embed (no input columns); `concat` rejects an empty list.
            return DataFrame(index=X.index)
        return concat(series_list, axis=1)
用于丢弃含空值行的转换器如下:
from pandas import DataFrame, Series
from sklearn.base import TransformerMixin
class NullRowDropper(TransformerMixin, BaseEstimator):
    """
    Drop null rows from a data-set.

    Basically a pass-through to pandas dropna() but drops associated rows on the targets too.

    The targets are not passed to ``transform``, so ``fit`` keeps a reference
    to them and ``transform`` removes the matching rows from that object
    in place.

    The class derives from ``BaseEstimator`` and stores ``how`` under its own
    name so that ``get_params`` / ``set_params`` / ``clone`` work.
    """

    def __init__(self, how='any'):
        """
        Create a new NullRowDropper.

        :param how: `'any'` or `'all'`
        """
        assert how in ('any', 'all'), "param: 'how' must be either 'any' or 'all'"
        self.how = how

    def fit(self, X: DataFrame = None, y: Series = None):
        """
        Remember the targets so that ``transform`` can drop rows from them.

        :param X: Ignored.
        :param y: The targets, or None when there are none to keep in sync.
        :return: This transformer.
        """
        self._y = y  # save y here so can modify its index inplace later (y is not passed to transform)
        return self

    def transform(self, X: DataFrame, y: Series = None):
        """
        Drop the null rows according to the `how` param given at initialisation.

        Side effect: if targets were given to ``fit``, every target row whose
        index label is absent from the returned frame is dropped from that
        object in place.

        :param X: The input pandas DataFrame.
        :param y: Ignored; the targets saved by ``fit`` are used instead.
        :rtype: DataFrame
        """
        X_out = X.dropna(how=self.how)

        # NOTE(review): the targets saved by `fit` are pruned on EVERY call,
        # including calls on data other than the data used for fitting --
        # confirm that callers only rely on this during fitting.
        targets = getattr(self, '_y', None)
        if targets is not None:
            # Labels present on the targets but no longer present in X.
            stale_labels = targets.index.difference(X_out.index, sort=False)
            targets.drop(stale_labels, inplace=True)

        return X_out