自定义scikit编码器抛出转换错误

时间:2018-07-11 23:21:10

标签: python machine-learning scikit-learn one-hot-encoding

我正在修改一些在线代码来创建自己的scikit学习One Hot Encoder版本。自定义类为我做了一些事情,大多数情况下,它允许设置一个阈值,在该阈值以下,罕见类别的分类变量将被转储到“其他”类中。 我可以正确安装,但是当我尝试进行转换时,出现转换错误,好像在OHE无法正常工作之前运行的嵌入式LabelEncoder一样。

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np

#my custom OHE-er
class CustomPandasTransformer(BaseEstimator, TransformerMixin):
    def _validate_input(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError('X must be a DataFrame, but gor type=%s' % type(x))
        return X

    @staticmethod
    def _validate_columns(X, cols):
        scols = set(X.columns)
        if not all(c in scols for c in cols):
            raise ValueError("all columns must be present in X")

class DummyEncoder(CustomPandasTransformer):
    def __init__(self, columns, sep='_', drop_one_level=True, tmp_nan_rep='MISSING',other_treshold=20):
        self.columns=columns
        self.sep = sep
        self.drop_one_level = drop_one_level
        self.tmp_nan_rep = tmp_nan_rep
        self.other_treshold = other_treshold


    def fit(self, X, y=None):
        X = self._validate_input(X).copy()

        tmp_nan = self.tmp_nan_rep
        oth_thr = self.other_treshold

        cols = self.columns
        self._validate_columns(X, cols)

        lab_encoders = {}
        for col in cols:
            #group low freq levels into a 'OTHER' level
            tmp_vc = X[col].value_counts()
            high_volume_levels = list(tmp_vc[tmp_vc>oth_thr].index)
            vec = [v if v in high_volume_levels else 'OTHER' for v in X[col].tolist()]

            vec = [tmp_nan if pd.isnull(v) else v for v in vec]
            svec = list(set(vec))
            if tmp_nan not in svec:
                svec.append(tmp_nan)

            le = LabelEncoder()
            lab_encoders[col] = le.fit(svec)

            X[col] = le.transform(vec)

        ohe_set = X[cols]
        ohe_nan_row = {c: lab_encoders[c].transform([tmp_nan])[0] for c in cols}
        ohe_set = ohe_set.append(ohe_nan_row, ignore_index=True)
        ohe = OneHotEncoder(sparse=False).fit(ohe_set)

        self.ohe_ = ohe
        self.le_ = lab_encoders
        self.cols_ = cols

        return self

    def transform(self, X):
        check_is_fitted(self, 'ohe_')
        X = self._validate_input(X).copy()

        ohe = self.ohe_
        lenc = self.le_
        cols = self.cols_
        tmp_nan = self.tmp_nan_rep
        sep = self.sep
        drop = self.drop_one_level

        self._validate_columns(X, cols)
        col_order = []
        drops = []

        for col in cols:
            le = lenc[col]

            vec = [v if v in list(le.classes_) else 'OTHER' for v in X[col].tolist()]
            vec = [tmp_nan if pd.isnull(v) else v for v in vec]

            vec_trans = le.transform(vec)
            X[col] = vec_trans

            le_clz = le.classes_.tolist()
            classes = ['%s%s%s' % (col,sep,clz) for clz in le_clz]
            col_order.extend(classes)

            if drop and len(le_clz)>1:
                drops.append(classes[-1])

            ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                                columns = col_order)

            ohe_trans.index=X.index

            if drops:
                ohe_trans = ohe_trans.drop(drops, axis=1)

            X = X.drop(cols, axis=1)

            X = pd.concat([X, ohe_trans], axis=1)
            return X

#the data
dicpd = {
 u'BILL_CLASS': {0: np.nan, 1: np.nan},
 u'CL_SUB_TYPE': {0: 'M', 1: 'M'},
 u'COB_TYPE': {0: np.nan, 1: np.nan},
 u'DUP_BILL_CLASS': {0: np.nan, 1: np.nan},
 u'DUP_CL_SUB_TYPE': {0: 'M', 1: 'M'},
 u'DUP_COB_TYPE': {0: np.nan, 1: np.nan},
 u'DUP_DX_CD': {0: 'M9901', 1: 'Z0100'},
 u'DUP_FAC_TYPE': {0: np.nan, 1: np.nan},
 u'DUP_FREQUENCY': {0: np.nan, 1: np.nan},
 u'DUP_LOB_ID': {0: 'PBC1', 1: 'PBC1'},
 u'DUP_MOD1': {0: np.nan, 1: np.nan},
 u'DUP_MOD2': {0: np.nan, 1: np.nan},
 u'DUP_MOD3': {0: np.nan, 1: np.nan},
 u'DUP_MOD4': {0: np.nan, 1: np.nan},
 u'DUP_POS_CD': {0: '11', 1: '11'},
 u'DUP_PROC_CD': {0: '98941', 1: 'V2020'},
 u'DUP_REV_CD': {0: np.nan, 1: np.nan},
 u'DX_CD': {0: 'M9901', 1: 'Z0100'},
 u'FAC_TYPE': {0: np.nan, 1: np.nan},
 u'FREQUENCY': {0: np.nan, 1: np.nan},
 u'MBR_AGE': {0: 48, 1: 56},
 u'MOD1': {0: '59', 1: np.nan},
 u'MOD2': {0: np.nan, 1: np.nan},
 u'MOD3': {0: np.nan, 1: np.nan},
 u'MOD4': {0: np.nan, 1: np.nan},
 u'POS_CD': {0: '11', 1: '11'},
 u'PROC_CD': {0: '97140', 1: 'V2781'},
 u'REV_CD': {0: np.nan, 1: np.nan},
 u'RULE_1': {0: 0, 1: 0},
 u'RULE_3': {0: 1, 1: 1},
 u'RULE_4': {0: 1, 1: 1},
 u'RULE_5': {0: 0, 1: 0},
 u'RULE_6': {0: 1, 1: 1},
 'SAME_DX': {0: 1, 1: 1},
 'SAME_POS_CD': {0: 1, 1: 1},
 'SAME_PROC': {0: 0, 1: 0},
 'SAME_PROV': {0: 1, 1: 1},
 'SAME_REV': {0: 0, 1: 0},
 'SAME_TOT': {0: 3, 1: 3},
 u'SYSTEM_GEN_DRG': {0: np.nan, 1: np.nan}}

#read into pandas
df1 = pd.DataFrame(dicpd)

#get object types
categorical_features = list(df1.select_dtypes(include=['object']).columns)

de = DummyEncoder(columns = categorical_features,other_treshold=10,drop_one_level=False)

de.fit(df1)

de.transform(df1)


ValueErrorTraceback (most recent call last)
<ipython-input-5-3a72a3fa8104> in <module>()
    161 de.fit(df1)
    162 
--> 163 de.transform(df1)

<ipython-input-5-3a72a3fa8104> in transform(self, X)
     95                 drops.append(classes[-1])
     96 
---> 97             ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
     98                                                 columns = col_order)
     99 

/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in transform(self, X)
   2073         """
   2074         return _transform_selected(X, self._transform,
-> 2075                                    self.categorical_features, copy=True)
   2076 
   2077 

/data/dataiku-dss-4.2.3/python.packages/sklearn/preprocessing/data.pyc in _transform_selected(X, transform, selected, copy)
   1807     X : array or sparse matrix, shape=(n_samples, n_features_new)
   1808     """
-> 1809     X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
   1810 
   1811     if isinstance(selected, six.string_types) and selected == "all":

/data/dataiku-dss-4.2.3/python.packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

ValueError: could not convert string to float: V2781

1 个答案:

答案 0 :(得分:0)

我发现了错误。 整个部分

ohe_trans = pd.DataFrame.from_records(data=ohe.transform(X[cols]),
                                                columns = col_order)

            ohe_trans.index=X.index

            if drops:
                ohe_trans = ohe_trans.drop(drops, axis=1)

            X = X.drop(cols, axis=1)

位于列循环内。需要在上一个循环完成后运行。