I am modifying some code I found online to build my own version of scikit-learn's OneHotEncoder. The custom class does a few things for me; most importantly, it lets me set a threshold below which the rare categories of a categorical variable get dumped into an 'OTHER' class. Fitting works fine, but when I call transform I get a conversion error, as if the embedded LabelEncoder that runs ahead of the OHE were not doing its job.
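To make the intent concrete, the rare-level grouping described above amounts to something like this minimal sketch (the toy series and threshold are invented purely for illustration and are not part of the actual data):

import pandas as pd

s = pd.Series(['A', 'A', 'A', 'B', 'B', 'C'])     # hypothetical column
threshold = 2
counts = s.value_counts()
frequent = set(counts[counts > threshold].index)  # only 'A' clears the bar
grouped = s.where(s.isin(frequent), 'OTHER')      # 'B' and 'C' collapse to 'OTHER'

The full class below wraps this idea, plus NaN handling and one-hot encoding.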
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np


# my custom OHE-er
class CustomPandasTransformer(BaseEstimator, TransformerMixin):
    def _validate_input(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError('X must be a DataFrame, but got type=%s' % type(X))
        return X

    @staticmethod
    def _validate_columns(X, cols):
        scols = set(X.columns)
        if not all(c in scols for c in cols):
            raise ValueError("all columns must be present in X")
class DummyEncoder(CustomPandasTransformer):
    def __init__(self, columns, sep='_', drop_one_level=True,
                 tmp_nan_rep='MISSING', other_treshold=20):
        self.columns = columns
        self.sep = sep
        self.drop_one_level = drop_one_level
        self.tmp_nan_rep = tmp_nan_rep
        self.other_treshold = other_treshold

    def fit(self, X, y=None):
        X = self._validate_input(X).copy()
        tmp_nan = self.tmp_nan_rep
        oth_thr = self.other_treshold
        cols = self.columns
        self._validate_columns(X, cols)

        lab_encoders = {}
        for col in cols:
            # group low-frequency levels into an 'OTHER' level
            tmp_vc = X[col].value_counts()
            high_volume_levels = list(tmp_vc[tmp_vc > oth_thr].index)
            vec = [v if v in high_volume_levels else 'OTHER' for v in X[col].tolist()]
            vec = [tmp_nan if pd.isnull(v) else v for v in vec]
            svec = list(set(vec))
            if tmp_nan not in svec:
                svec.append(tmp_nan)

            le = LabelEncoder()
            lab_encoders[col] = le.fit(svec)
            X[col] = le.transform(vec)

        ohe_set = X[cols]
        ohe_nan_row = {c: lab_encoders[c].transform([tmp_nan])[0] for c in cols}
        ohe_set = ohe_set.append(ohe_nan_row, ignore_index=True)
        ohe = OneHotEncoder(sparse=False).fit(ohe_set)

        self.ohe_ = ohe
        self.le_ = lab_encoders
        self.cols_ = cols
        return self
    def transform(self, X):
        check_is_fitted(self, 'ohe_')
        X = self._validate_input(X).copy()

        ohe = self.ohe_
        lenc = self.le_
        cols = self.cols_
        tmp_nan = self.tmp_nan_rep
        sep = self.sep
        drop = self.drop_one_level

        self._validate_columns(X, cols)

        col_order = []
        drops = []
        for col in cols:
            le = lenc[col]
            vec = [v if v in list(le.classes_) else 'OTHER' for v in X[col].tolist()]
            vec = [tmp_nan if pd.isnull(v) else v for v in vec]
            vec_trans = le.transform(vec)
            X[col] = vec_trans

            le_clz = le.classes_.tolist()
            classes = ['%s%s%s' % (col, sep, clz) for clz in le_clz]
            col_order.extend(classes)

            if drop and len(le_clz) > 1:
                drops.append(classes[-1])

            ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                                  columns=col_order)
            ohe_trans.index = X.index
            if drops:
                ohe_trans = ohe_trans.drop(drops, axis=1)
            X = X.drop(cols, axis=1)

        X = pd.concat([X, ohe_trans], axis=1)
        return X
#the data
dicpd = {
u'BILL_CLASS': {0: np.nan, 1: np.nan},
u'CL_SUB_TYPE': {0: 'M', 1: 'M'},
u'COB_TYPE': {0: np.nan, 1: np.nan},
u'DUP_BILL_CLASS': {0: np.nan, 1: np.nan},
u'DUP_CL_SUB_TYPE': {0: 'M', 1: 'M'},
u'DUP_COB_TYPE': {0: np.nan, 1: np.nan},
u'DUP_DX_CD': {0: 'M9901', 1: 'Z0100'},
u'DUP_FAC_TYPE': {0: np.nan, 1: np.nan},
u'DUP_FREQUENCY': {0: np.nan, 1: np.nan},
u'DUP_LOB_ID': {0: 'PBC1', 1: 'PBC1'},
u'DUP_MOD1': {0: np.nan, 1: np.nan},
u'DUP_MOD2': {0: np.nan, 1: np.nan},
u'DUP_MOD3': {0: np.nan, 1: np.nan},
u'DUP_MOD4': {0: np.nan, 1: np.nan},
u'DUP_POS_CD': {0: '11', 1: '11'},
u'DUP_PROC_CD': {0: '98941', 1: 'V2020'},
u'DUP_REV_CD': {0: np.nan, 1: np.nan},
u'DX_CD': {0: 'M9901', 1: 'Z0100'},
u'FAC_TYPE': {0: np.nan, 1: np.nan},
u'FREQUENCY': {0: np.nan, 1: np.nan},
u'MBR_AGE': {0: 48, 1: 56},
u'MOD1': {0: '59', 1: np.nan},
u'MOD2': {0: np.nan, 1: np.nan},
u'MOD3': {0: np.nan, 1: np.nan},
u'MOD4': {0: np.nan, 1: np.nan},
u'POS_CD': {0: '11', 1: '11'},
u'PROC_CD': {0: '97140', 1: 'V2781'},
u'REV_CD': {0: np.nan, 1: np.nan},
u'RULE_1': {0: 0, 1: 0},
u'RULE_3': {0: 1, 1: 1},
u'RULE_4': {0: 1, 1: 1},
u'RULE_5': {0: 0, 1: 0},
u'RULE_6': {0: 1, 1: 1},
'SAME_DX': {0: 1, 1: 1},
'SAME_POS_CD': {0: 1, 1: 1},
'SAME_PROC': {0: 0, 1: 0},
'SAME_PROV': {0: 1, 1: 1},
'SAME_REV': {0: 0, 1: 0},
'SAME_TOT': {0: 3, 1: 3},
u'SYSTEM_GEN_DRG': {0: np.nan, 1: np.nan}}
# read into pandas
df1 = pd.DataFrame(dicpd)

# get object types
categorical_features = list(df1.select_dtypes(include=['object']).columns)

de = DummyEncoder(columns=categorical_features, other_treshold=10, drop_one_level=False)
de.fit(df1)
de.transform(df1)
ValueError                                Traceback (most recent call last)
<ipython-input-5-3a72a3fa8104> in <module>()
161 de.fit(df1)
162
--> 163 de.transform(df1)
<ipython-input-5-3a72a3fa8104> in transform(self, X)
95 drops.append(classes[-1])
96
---> 97 ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
98 columns = col_order)
99
/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in transform(self, X)
2073 """
2074 return _transform_selected(X, self._transform,
-> 2075 self.categorical_features, copy=True)
2076
2077
/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in _transform_selected(X, transform, selected, copy)
1807 X : array or sparse matrix, shape=(n_samples, n_features_new)
1808 """
-> 1809 X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
1810
1811 if isinstance(selected, six.string_types) and selected == "all":
/data/dataiku-dss-4.2.3/python.packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: could not convert string to float: V2781
Answer (score: 0)
I found the error. This whole block

ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                      columns=col_order)
ohe_trans.index = X.index
if drops:
    ohe_trans = ohe_trans.drop(drops, axis=1)
X = X.drop(cols, axis=1)

was sitting inside the column loop; it has to run only after that loop has finished. On the first iteration, ohe.transform(X[cols]) still sees the raw string values of the not-yet-encoded columns (such as 'V2781'), which is exactly the value check_array fails to convert to float.
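Concretely, the corrected transform could look like the sketch below, reusing only the pieces already shown in the question (I have not re-run it against the original data or the old scikit-learn build from the traceback):

    def transform(self, X):
        check_is_fitted(self, 'ohe_')
        X = self._validate_input(X).copy()

        ohe = self.ohe_
        lenc = self.le_
        cols = self.cols_
        tmp_nan = self.tmp_nan_rep
        sep = self.sep
        drop = self.drop_one_level

        self._validate_columns(X, cols)

        col_order = []
        drops = []
        for col in cols:
            # first pass: label-encode every column, mapping unseen levels to 'OTHER'
            le = lenc[col]
            vec = [v if v in list(le.classes_) else 'OTHER' for v in X[col].tolist()]
            vec = [tmp_nan if pd.isnull(v) else v for v in vec]
            X[col] = le.transform(vec)

            le_clz = le.classes_.tolist()
            classes = ['%s%s%s' % (col, sep, clz) for clz in le_clz]
            col_order.extend(classes)
            if drop and len(le_clz) > 1:
                drops.append(classes[-1])

        # only now, with every column integer-encoded, hand the frame to the fitted OHE
        ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                              columns=col_order)
        ohe_trans.index = X.index
        if drops:
            ohe_trans = ohe_trans.drop(drops, axis=1)

        X = X.drop(cols, axis=1)
        X = pd.concat([X, ohe_trans], axis=1)
        return X

With the block outside the loop, ohe.transform only ever receives integer codes, so check_array no longer has to coerce strings like 'V2781' to float.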