我正在寻找Python中Yeo-Johnson power transformation的实现。不幸的是,SciPy库有box-cox的实现(仅适用于正数)!
目前,我正在通过RPY2使用R包(caret::preProcess);然而,它的性能大受欢迎。
由于
P.S。:从Python调用caret :: preProcess:
#!/usr/bin/env python
from rpy2.robjects.functions import SignatureTranslatedFunction
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r_caret = importr("caret")
r_stats = importr("stats")
feature_in = list(range(-5, 5))
r_caret.preProcess = SignatureTranslatedFunction(r_caret.preProcess, {'r_lambda': 'lambda'})
r_scale = r_caret.preProcess(robjects.DataFrame({'a': robjects.FloatVector(feature_in)}),
method=robjects.StrVector(["center", "scale", "YeoJohnson"]),
r_lambda=robjects.IntVector([-10, 10, 0.1]),
na_remove=False)
temp = r_stats.predict(r_scale, robjects.DataFrame({'a': robjects.FloatVector(feature_in)}))
feature_out = np.array(temp.rx2(1), dtype=np.float64)
print(feature_in) # output [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]
print(feature_out) # output [-1.46625361 -1.14751589 -0.82725836 -0.50508336 -0.1803113 0.14850228 0.48337191 0.8224039 1.16416754 1.5079769]
解决方案如下:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import warnings
import numpy as np
import pandas as pd
import sys
__author__ = "Mohsen Mesgarpour"
__copyright__ = "Copyright 2016, https://github.com/mesgarpour"
__credits__ = ["Mohsen Mesgarpour"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "Mohsen Mesgarpour"
__email__ = "mohsen.mesgarpour@gmail.com"
class YeoJohnson:
"""
Computing Yeo-Johnson transofrmation, which is an extension of Box-Cox transformation
but can handle both positive and negative values.
References:
Weisberg, S. (2001). Yeo-Johnson Power Transformations.
Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003.
https://www.stat.umn.edu/arc/yjpower.pdf
Adapted from CRAN - Package VGAM
"""
def fit(self, y, lmbda, derivative=0, epsilon=np.finfo(np.float).eps, inverse=False):
"""
:param y: The variable to be transformed (numeric array).
:param lmbda: The function's Lambda value (numeric value or array).
:param derivative: The derivative with respect to lambda
(non-negative integer; default: ordinary function evaluation).
:param epsilon: The lambda's tolerance (positive value).
:param inverse: The inverse transformation option (logical value).
:return: The Yeo-Johnson transformation or its inverse, or its derivatives with respect to lambda, of y.
"""
# Validate arguments
self.__validate(y, lmbda, derivative, epsilon, inverse)
# initialise
y = np.array(y, dtype=float)
result = y
if not (isinstance(lmbda, list) or isinstance(lmbda, np.ndarray)):
lmbda, y = np.broadcast_arrays(lmbda, y)
lmbda = np.array(lmbda, dtype=float)
l0 = np.abs(lmbda) > epsilon
l2 = np.abs(lmbda - 2) > epsilon
# Inverse
with warnings.catch_warnings(): # suppress warnings
warnings.simplefilter("ignore")
if inverse is True:
mask = np.where(((y >= 0) & l0) == True)
result[mask] = np.power(np.multiply(y[mask], lmbda[mask]) + 1, 1 / lmbda[mask]) - 1
mask = np.where(((y >= 0) & ~l0) == True)
result[mask] = np.expm1(y[mask])
mask = np.where(((y < 0) & l2) == True)
result[mask] = 1 - np.power(np.multiply(-(2 - lmbda[mask]), y[mask]) + 1, 1 / (2 - lmbda[mask]))
mask = np.where(((y < 0) & ~l2) == True)
result[mask] = -np.expm1(-y[mask])
# Derivative
else:
if derivative == 0:
mask = np.where(((y >= 0) & l0) == True)
result[mask] = np.divide(np.power(y[mask] + 1, lmbda[mask]) - 1, lmbda[mask])
mask = np.where(((y >= 0) & ~l0) == True)
result[mask] = np.log1p(y[mask])
mask = np.where(((y < 0) & l2) == True)
result[mask] = np.divide(-(np.power(-y[mask] + 1, 2 - lmbda[mask]) - 1), 2 - lmbda[mask])
mask = np.where(((y < 0) & ~l2) == True)
result[mask] = -np.log1p(-y[mask])
# Not Derivative
else:
p = self.fit(y, lmbda, derivative=derivative - 1, epsilon=epsilon, inverse=inverse)
mask = np.where(((y >= 0) & l0) == True)
result[mask] = np.divide(np.multiply(np.power(y[mask] + 1, lmbda[mask]),
np.power(np.log1p(y[mask]), derivative)) -
np.multiply(derivative, p[mask]), lmbda[mask])
mask = np.where(((y >= 0) & ~l0) == True)
result[mask] = np.divide(np.power(np.log1p(y[mask]), derivative + 1), derivative + 1)
mask = np.where(((y < 0) & l2) == True)
result[mask] = np.divide(-(np.multiply(np.power(-y[mask] + 1, 2 - lmbda[mask]),
np.power(-np.log1p(-y[mask]), derivative)) -
np.multiply(derivative, p[mask])), 2 - lmbda[mask])
mask = np.where(((y < 0) & ~l2) == True)
result[mask] = np.divide(np.power(-np.log1p(-y[mask]), derivative + 1), derivative + 1)
return result
@staticmethod
def __validate(y, lmbda, derivative, epsilon, inverse):
try:
if not isinstance(y, (list, np.ndarray, pd.Series)):
raise Exception("Argument 'y' must be a list!")
if not isinstance(lmbda, (int, float, np.int, np.float)):
if not isinstance(lmbda, (list, np.ndarray, pd.Series)) or len(lmbda) != len(y):
raise Exception("Argument 'lmbda' must be a number "
"or a list, which its length matches 'y' argument!")
if not isinstance(derivative, (int, float, np.int, np.float)) or derivative < 0:
raise Exception("Argument 'derivative' must be a non-negative integer!")
if not isinstance(epsilon, (int, float, np.int, np.float)) or epsilon <= 0:
raise Exception("Argument 'epsilon' must be a positive number!")
if not isinstance(inverse, bool):
raise Exception("Argument 'inverse' must be boolean!")
if inverse is True and derivative != 0:
raise Exception("Argument 'derivative' must be zero "
"when argument 'inverse' is 'True'!")
except ():
sys.exit()
答案 0 :(得分:1)
使用numpy,这些东西相当简单:
In [2]: def yj(lam, y):
lam, y = np.broadcast_arrays(lam, y)
l0 = lam == 0
l2 = lam == 2
result = np.empty_like(lam, dtype=float)
mask1 = (~l0) & (y >= 0)
ym, lm = y[mask1], lam[mask1]
result[mask1] = ((ym + 1.)**lm - 1.) / lm
mask2 = (l0) & (y >= 0)
result[mask2] = np.log1p(y[mask2])
# TODO: handle two more cases here, this is similar
return result
请注意,上面是一个未经测试的演示,它不会检查dtypes,也不会对边缘情况(小y,小lambda)等进行任何处理。