我试图理解这个二进制编码器背后的逻辑。
它会自动对分类变量进行虚拟编码(类似于 sklearn 中的独热编码),但输出的列数减少为唯一值个数的 log2。
基本上,当我使用这个库时,我注意到我的虚拟变量只限于几个唯一值。经过进一步调查后,我注意到了这个 @staticmethod,它对分类变量中唯一值的个数(len)取 log2。
我的问题是为什么?我意识到这会降低输出数据的维数,但这样做背后的逻辑是什么?如何获取log2来确定表示数据需要多少位数?
def calc_required_digits(X, col):
    """
    Return how many binary digits are needed to represent the classes
    present in column ``col`` of ``X``.
    """
    n_classes = len(X[col].unique())
    # ceil(log2(n)) bits are enough to give each of n classes its own code
    return int(np.ceil(np.log2(n_classes)))
完整源代码:
"""Binary encoding"""
import copy
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.utils import get_obj_cols, convert_input
__author__ = 'willmcginnis'
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Binary encoding for categorical variables, similar to onehot, but stores categories as binary bitstrings.

    Parameters
    ----------

    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    impute_missing: bool
        boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
    handle_unknown: str
        options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
        impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause
        unexpected changes in dimension in some cases.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = BinaryEncoder(cols=['CHAS', 'RAD']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 506 entries, 0 to 505
    Data columns (total 16 columns):
    CHAS_0     506 non-null int64
    RAD_0      506 non-null int64
    RAD_1      506 non-null int64
    RAD_2      506 non-null int64
    RAD_3      506 non-null int64
    CRIM       506 non-null float64
    ZN         506 non-null float64
    INDUS      506 non-null float64
    NOX        506 non-null float64
    RM         506 non-null float64
    AGE        506 non-null float64
    DIS        506 non-null float64
    TAX        506 non-null float64
    PTRATIO    506 non-null float64
    B          506 non-null float64
    LSTAT      506 non-null float64
    dtypes: float64(11), int64(5)
    memory usage: 63.3 KB
    None

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.impute_missing = impute_missing
        self.handle_unknown = handle_unknown
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        self.digits_per_col = {}

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # remember the bit width of every encoded column so transform() uses
        # the widths seen at fit time
        for col in self.cols:
            self.digits_per_col[col] = self.calc_required_digits(X, col)

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            # force a DataFrame here: with return_df=False transform() would
            # hand back a numpy array, which has no .columns to inspect
            X_temp = self.transform(X, override_return_df=True)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X, override_return_df=False):
        """Perform the transformation to new categorical data.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
        override_return_df : bool
            when True, return a DataFrame even if ``return_df`` is False.
            ``fit`` sets this so it can look at the output column names.

        Returns
        -------

        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.

        Raises
        ------

        ValueError
            if the encoder has not been fitted, or if X does not have the
            number of columns seen during fit.

        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, ))

        # nothing to encode: hand the converted input back untouched
        if not self.cols:
            return X

        X = self.ordinal_encoder.transform(X)
        X = self.binary(X, cols=self.cols)

        if self.drop_invariant:
            # axis is passed by keyword; newer pandas rejects a positional axis
            X = X.drop(self.drop_cols, axis=1)

        if self.return_df or override_return_df:
            return X
        return X.values

    def binary(self, X_in, cols=None):
        """
        Binary encoding encodes the integers as binary code with one column per digit.

        Works on a deep copy of ``X_in``. The returned frame holds the new
        digit columns first (named ``<col>_<digit index>``), followed by the
        columns that were not encoded; the encoded source columns are left out.
        """

        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            # get how many digits we need to represent the classes present
            digits = self.digits_per_col[col]

            # map the ordinal column into a list of these digits, of length digits
            X[col] = X[col].map(lambda x: self.col_transform(x, digits))

            for dig in range(digits):
                digit_col = str(col) + '_%d' % (dig, )
                # col_transform yields None for missing/negative codes; keep
                # those as None in every digit column
                X[digit_col] = X[col].map(lambda r: int(r[dig]) if r is not None else None)
                bin_cols.append(digit_col)

        # keep only the digit columns and the untouched columns, in that order
        X = X.reindex(columns=bin_cols + pass_thru)

        return X

    @staticmethod
    def calc_required_digits(X, col):
        """
        figure out how many digits we need to represent the classes present

        Returns ceil(log2(number of unique values in X[col])) as an int.
        """
        n_classes = len(X[col].unique())
        return int(np.ceil(np.log2(n_classes)))

    @staticmethod
    def col_transform(col, digits):
        """
        The lambda body to transform the column values

        Turns one ordinal value into the list of its binary digits, padded on
        the left with zeros up to ``digits`` entries. Returns None when the
        value is None or negative.
        """

        if col is None or float(col) < 0.0:
            return None

        bits = list("{0:b}".format(int(col)))
        # range() of a non-positive count is empty, so values that already
        # have `digits` (or more) binary digits come back unpadded
        return [0 for _ in range(digits - len(bits))] + bits
答案 0 :(得分:5)
我的问题是为什么?我意识到这减少了维数 输出数据,但这背后的逻辑是什么?
基本上,分类编码的问题是让你的算法处理分类特征。因此,有几种方法可以实现,包括二进制编码。实际上,如果您了解它,它的逻辑接近于一个热编码(OHE)的逻辑。
对于二进制编码,分类向量中的每个唯一标签都会被随机关联到 0 和(唯一标签数量 - 1)之间的一个数字。然后,把这个数字用二进制(基数 2)表示,并把它的每一位(0 或 1)"转写"到新创建的列中。
例如,假设您的数据集中有三个不同的标签:'A'、'B' 和 'C'。
以下对应关系是随机构建的:
'A' -> 1 -> 01;
'B' -> 2 -> 10;
'C' -> 0 -> 00。
因此,给定数据集的编码示例是:
索引, my_category, enc_category_0, enc_category_1
0, A, 1, 0
1, B, 0, 1
2, C, 0, 0
3, A, 1, 0
关于它的效用,正如你所说,它降低了维度。此外,我猜它有助于在编码列中没有像OHE那样过多的零。这是一篇有趣的帖子:https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931
如何获取log2来确定表示数据需要多少位数? 如果您了解了上面的工作原理,就能理解为什么使用 log2。对唯一值的个数 n 取 log2 并向上取整,得到的就是区分 n 个不同取值(编号 0 到 n-1)所需的二进制位数。示例:⌈log2(10)⌉ = ⌈3.32⌉ = 4,即对 10 个不同的类别进行二进制编码需要 4 位数。
有关实现和代码示例的更多信息:http://contrib.scikit-learn.org/categorical-encoding/_modules/category_encoders/binary.html#BinaryEncoder
希望我很清楚,
Tchau