无法使用FeatureUnion在Python中组合已处理的数值特征和分类特征

时间:2018-02-26 18:13:19

标签: pandas machine-learning scikit-learn normalization feature-extraction

我正在尝试使用Age和Gender来预测Med,但我不熟悉scikit-learn的Pipeline和FeatureUnion,因此遇到了一些问题。我阅读了一些教程和回答,并据此编写了下面的代码,但我不太清楚如何将拆分后的数据传入管道。

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix

# Load the example CSV file into a pandas DataFrame
data_directory = 'C:/Users/Asus/'
file_name = 'Example.csv'

df = pd.read_csv('{}{}'.format(data_directory, file_name))
df_len = df.shape[0]

# Show the names of all columns in the loaded frame
print(list(df.columns))

# Transformer that selects a fixed set of columns by name
class Columns(BaseEstimator, TransformerMixin):
    """Select the columns listed in ``names`` from the input.

    Parameters
    ----------
    names : optional
        Key used to index ``X`` in :meth:`transform`; stored unchanged.
    """

    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.names``."""
        selected = X[self.names]
        return selected

# Column-name lists consumed by the Columns selectors of the first pipeline.
# NOTE(review): both lists are left empty and are passed unchanged to that
# pipeline, so each branch selects zero columns -- consistent with the
# "Found array with 0 feature(s)" error quoted further down.
numeric = [] # list of numeric column names
categorical = [] # list of categorical column names

# Creating random subsample for fast model building
def sample_n(df, n, replace=False, weight=None, seed=None):
    """Return ``n`` randomly chosen rows of ``df``.

    Parameters
    ----------
    df : DataFrame to draw rows from.
    n : number of rows to draw.
    replace : whether the same row may be drawn more than once.
    weight : optional per-row probabilities, forwarded as ``p`` to
        ``RandomState.choice``.
    seed : seed for the private ``RandomState``; the same seed gives the
        same rows.

    Returns
    -------
    DataFrame holding the drawn rows, in the order they were drawn.
    """
    rng = np.random.RandomState(seed)
    row_count = df.shape[0]
    # Draw row positions (not index labels), then fetch them positionally.
    picked = rng.choice(row_count, size=n, replace=replace, p=weight)
    return df.take(picked, axis=0)

# Work on a reproducible 300-row subsample to keep model building fast
df = sample_n(df, n=300, seed=1123)

# Merge FG-LAI, SG-LAI and Both-LAI together into one group (MED=3).
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; with a boolean row mask and a column label both select the
# same cells.
df.loc[(df['MED']==4)|(df['MED']==5), 'MED']=3

# Remove No-Med (MED=1) cases. The MED==5 clause is kept from the original
# but can no longer match: those rows were recoded to 3 just above.
df = df.drop(df[(df['MED']==1)|(df['MED']==5)].index)

# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)

# Retain only the needed predictors
X = X.filter(['age', 'gender'])

# Find the numerical columns, exclude categorical columns.
# NOTE(review): X_num_cols is not referenced by the pipeline that follows,
# which reads the `numeric` / `categorical` lists instead.
X_num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]

# Split data into train and test sets (50/50, stratified on the target so
# both halves keep the same class proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=567,
                                                    stratify=y)

# Pipeline: scale the numeric columns, one-hot encode the categorical ones,
# concatenate both feature sets and feed them to a logistic regression.
# NOTE(review): `numeric` and `categorical` are the empty lists defined
# earlier, so both Columns selectors pick zero columns (see the
# "0 feature(s)" error quoted below).
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical),OneHotEncoder(sparse=False)))
    ])),
    ('model', LogisticRegression())
])

# Declare hyperparameters
# NOTE(review): the final step is named 'model', but every key uses the
# 'logisticregression__' prefix, which matches the "Invalid parameter
# logisticregression" error quoted below. The get_params() listing further
# down also spells the regularisation parameter with a capital 'C'.
hyperparameters = {'logisticregression__c' : [0.01, 0.1, 1.0, 10.0],
                    'logisticregression__penalty' : ['l1', 'l2'],
                    'logisticregression__multi_class': ['ovr'],
                    'logisticregression__class_weight': ['balanced', None],
                    }

# SKlearn cross-validation with pipeline (10-fold grid search over the
# hyperparameter grid above)
clf = GridSearchCV(pipe, hyperparameters, cv=10)

# Fit and tune model
clf.fit(X_train, y_train)

错误:

ValueError: Invalid parameter logisticregression for estimator Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('columns', Columns(names=[])), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('categorical', Pipeline(memory=None,
     steps=[('columns', Columns(nam...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

编辑:

# List every parameter name the pipeline accepts, i.e. the valid keys for a
# GridSearchCV parameter grid.
print (pipe.get_params().keys())

给出

dict_keys(['memory', 'steps', 'features', 'LR_model', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__numeric', 'features__categorical', 'features__numeric__memory', 'features__numeric__steps', 'features__numeric__columns', 'features__numeric__standardscaler', 'features__numeric__columns__names', 'features__numeric__standardscaler__copy', 'features__numeric__standardscaler__with_mean', 'features__numeric__standardscaler__with_std', 'features__categorical__memory', 'features__categorical__steps', 'features__categorical__columns', 'features__categorical__onehotencoder', 'features__categorical__columns__names', 'features__categorical__onehotencoder__categorical_features', 'features__categorical__onehotencoder__dtype', 'features__categorical__onehotencoder__handle_unknown', 'features__categorical__onehotencoder__n_values', 'features__categorical__onehotencoder__sparse', 'LR_model__C', 'LR_model__class_weight', 'LR_model__dual', 'LR_model__fit_intercept', 'LR_model__intercept_scaling', 'LR_model__max_iter', 'LR_model__multi_class', 'LR_model__n_jobs', 'LR_model__penalty', 'LR_model__random_state', 'LR_model__solver', 'LR_model__tol', 'LR_model__verbose', 'LR_model__warm_start'])

将超参数名的前缀从'logisticregression__'更改为'model__'后,我收到新错误:

ValueError: Found array with 0 feature(s) (shape=(109, 0)) while a minimum of 1 is required by StandardScaler.

编辑2:

# Retain only the needed predictors
#X = X.filter(['age', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num'])
# The comma after 'NUmber_ED_readmit' was missing, so Python concatenated it
# with 'Fail_renew' into one non-existent column name and filter() silently
# dropped both columns.
X_selected = X.filter(['age', 'Geo', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num',
    'DAD_readmit', 'Num_DAD_readmit', 'ED_readmit', 'NUmber_ED_readmit',
    'Fail_renew', 'FR_num'])

# From the selected X, further choose the categorical columns only; they are
# hand-selected because some categorical variables are coded as 0/1
X_selected_cat = X_selected.filter(['Geo', 'ccis'])

# Numeric column names are detected automatically from the dtypes;
# categorical column names come from the hand-selected frame above
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
X_cat_cols = X_selected_cat.columns

# Split data into train and test sets (50/50, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
                                                    test_size=0.5,
                                                    random_state=567,
                                                    stratify=y)

# Pipeline: scale the automatically detected numeric columns, one-hot encode
# the hand-selected categorical columns, then fit a logistic regression.
# NOTE(review): the answer below attributes the "could not convert string to
# float: 'Urban'" error to OneHotEncoder receiving string-valued input from
# the categorical branch -- presumably the 'Geo' column; confirm against the
# data and the installed scikit-learn version.
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols),OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])

错误:

ValueError: could not convert string to float: 'Urban'

1 个答案:

答案 0 :(得分:1)

OneHotEncoder要求输入数组为int类型,但您提供的是string。您可以使用LabelEncoder或LabelBinarizer将string转换为int,然后就可以使用OneHotEncoder了。

# LabelEncoder is deliberately not used as a step here: its fit/transform
# accept only a single 1-D target array `y`, whereas Pipeline calls
# fit_transform(X, y) on every intermediate step, and LabelEncoder is not
# imported by this script. From scikit-learn 0.20 onwards OneHotEncoder
# encodes string-valued columns directly, so no integer pre-encoding step is
# needed (this requires scikit-learn >= 0.20).
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols), OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])