我正在尝试使用Age and Gender来预测Med,但我不熟悉Scikit-Learning的Pipeline和FeatureUnion,并遇到了一些问题。我读了一些tutorial和answer,这就是我编写下面代码的方法,但我不太清楚如何将拆分数据提供给管道函数。
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
# Import data into Pandas data frame
data_directory = 'C:/Users/Asus/'
file_name = 'Example.csv'
df = pd.read_csv(data_directory + file_name)
df_len = len(df)
# Get a lit of all variables
print (list(df))
# Class that identifies Column type
class Columns(BaseEstimator, TransformerMixin):
def __init__(self, names=None):
self.names = names
def fit (self, X, y=None, **fit_params):
return self
def transform(self, X):
return X[self.names]
numeric = [] # list of numeric column names
categorical = [] # list of categorical column names
# Creating random subsample for fast model building
def sample_n(df, n, replace=False, weight=None, seed=None):
"""Sample n rows from a DataFrame at random"""
rs = np.random.RandomState(seed)
locs = rs.choice(df.shape[0], size=n, replace=replace, p=weight)
return df.take(locs, axis=0)
df = sample_n(df, n=300, seed=1123)
# Merge FG-LAI, SG-LAI and Both-LAI together into one group (MED=3)
df.ix[(df['MED']==4)|(df['MED']==5), 'MED']=3
# Remove No-Med (MED=1) and Both-LAI (MED=5) cases
df = df.drop(df[(df['MED']==1)|(df['MED']==5)].index)
# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)
# Retain only the needed predictors
X = X.filter(['age', 'gender'])
# Find the numerical columns, exclude categorical columns
X_num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.5,
random_state=567,
stratify=y)
# Pipeline
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
('categorical', make_pipeline(Columns(names=categorical),OneHotEncoder(sparse=False)))
])),
('model', LogisticRegression())
])
# Declare hyperparameters
hyperparameters = {'logisticregression__c' : [0.01, 0.1, 1.0, 10.0],
'logisticregression__penalty' : ['l1', 'l2'],
'logisticregression__multi_class': ['ovr'],
'logisticregression__class_weight': ['balanced', None],
}
# SKlearn cross-validation with pipeline
clf = GridSearchCV(pipe, hyperparameters, cv=10)
# Fit and tune model
clf.fit(X_train, y_train)
错误:
ValueError: Invalid parameter logisticregression for estimator Pipeline(memory=None,
steps=[('features', FeatureUnion(n_jobs=1,
transformer_list=[('numeric', Pipeline(memory=None,
steps=[('columns', Columns(names=[])), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('categorical', Pipeline(memory=None,
steps=[('columns', Columns(nam...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
编辑:
print (pipe.get_params().keys())
给出
dict_keys(['memory', 'steps', 'features', 'LR_model', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__numeric', 'features__categorical', 'features__numeric__memory', 'features__numeric__steps', 'features__numeric__columns', 'features__numeric__standardscaler', 'features__numeric__columns__names', 'features__numeric__standardscaler__copy', 'features__numeric__standardscaler__with_mean', 'features__numeric__standardscaler__with_std', 'features__categorical__memory', 'features__categorical__steps', 'features__categorical__columns', 'features__categorical__onehotencoder', 'features__categorical__columns__names', 'features__categorical__onehotencoder__categorical_features', 'features__categorical__onehotencoder__dtype', 'features__categorical__onehotencoder__handle_unknown', 'features__categorical__onehotencoder__n_values', 'features__categorical__onehotencoder__sparse', 'LR_model__C', 'LR_model__class_weight', 'LR_model__dual', 'LR_model__fit_intercept', 'LR_model__intercept_scaling', 'LR_model__max_iter', 'LR_model__multi_class', 'LR_model__n_jobs', 'LR_model__penalty', 'LR_model__random_state', 'LR_model__solver', 'LR_model__tol', 'LR_model__verbose', 'LR_model__warm_start'])
更改为'model__'后,我收到新错误:
ValueError: Found array with 0 feature(s) (shape=(109, 0)) while a minimum of 1 is required by StandardScaler.
编辑2:
# Retain only the needed predictors
#X = X.filter(['age', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num'])
X_selected = X.filter(['age', 'Geo', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num',
'DAD_readmit', 'Num_DAD_readmit', 'ED_readmit', 'NUmber_ED_readmit'
'Fail_renew', 'FR_num'])
# from the selected X, further choose categorical only
X_selected_cat = X_selected.filter(['Geo', 'ccis']) # hand selected since some cat var has value 0, 1
# Find the numerical columns, exclude categorical columns
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))] # list of numeric column names, automated here
X_cat_cols = X_selected_cat.columns # list of categorical column names, previously hand-slected
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
test_size=0.5,
random_state=567,
stratify=y)
# Pipeline
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=X_num_cols),StandardScaler())),
('categorical', make_pipeline(Columns(names=X_cat_cols),OneHotEncoder(sparse=False)))
])),
('LR_model', LogisticRegression())
])
错误:
ValueError: could not convert string to float: 'Urban'
答案 0 :(得分:1)
OneHotEncoder
的输入数组是int
,但您提供了string
。您可以使用LabelEncoder
或LabelBinarizer
将string
转换为int
。然后,您将被允许使用OneHotEncoder
。
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=X_num_cols),StandardScaler())),
('categorical', make_pipeline(Columns(names=X_cat_cols),LabelEncoder(), OneHotEncoder(sparse=False)))
])),
('LR_model', LogisticRegression())
])