我正在尝试运行 https://github.com/combust/mleap-demo/tree/master/notebooks
中的示例 airbnb-price-regression-scikit.ipynb。我的想法是在 Python 中生成一个简化的管道,将其导出为 bundle 包,然后在 Scala 中导入。
当我尝试执行
# Serialize the linear regression model
model_pipeline.serialize_to_bundle('/tmp', 'scikit-airbnb.lr', init=True)
生成以下错误:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydev_bundle/pydev_umd.py", line 198, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/tullioc/PycharmProjects/untitled/airbnb/CreateSimpleModel.py", line 229, in <module>
model_pipeline.serialize_to_bundle('/git/spike', 'scikit-airbnb.lr.zip', init=True)
File "/Users/tullioc/PycharmProjects/untitled/venv/lib/python3.7/site-packages/mleap/sklearn/pipeline.py", line 29, in serialize_to_bundle
serializer.serialize_to_bundle(self, path, model_name, init)
File "/Users/tullioc/PycharmProjects/untitled/venv/lib/python3.7/site-packages/mleap/sklearn/pipeline.py", line 107, in serialize_to_bundle
step_i.serialize_to_bundle(bundle_dir, step_i.name)
File "/Users/tullioc/PycharmProjects/untitled/venv/lib/python3.7/site-packages/mleap/sklearn/preprocessing/data.py", line 295, in serialize_to_bundle
self.serialize(self, path, model_name, attributes, inputs, outputs)
File "/Users/tullioc/PycharmProjects/untitled/venv/lib/python3.7/site-packages/mleap/bundle/serialize.py", line 159, in serialize
json.dump(self.get_mleap_model(transformer, attributes), outfile, indent=3)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py", line 179, in dump
for chunk in iterable:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 431, in _iterencode
yield from _iterencode_dict(o, _current_indent_level)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 325, in _iterencode_list
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 325, in _iterencode_list
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 405, in _iterencode_dict
yield from chunks
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 438, in _iterencode
o = _default(o)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type int64 is not JSON serializable
我使用的是 Python 3.7.2 和 mleap 0.8.1。
有人成功创建和导出模型吗?任何帮助将不胜感激。
这是完整的示例:
import pandas as pd
import mleap.sklearn.pipeline
import mleap.sklearn.feature_union
import mleap.sklearn.base
import mleap.sklearn.logistic
import mleap.sklearn.preprocessing.data
from mleap.sklearn.preprocessing.data import FeatureExtractor, LabelEncoder, ReshapeArrayToN1
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Load the Airbnb listings; rows that fail to parse are skipped without warnings.
df = pd.read_csv('airbnb.csv', error_bad_lines=False, warn_bad_lines=False)

# Let pandas print wide frames in full: every column, 320 characters per row.
desired_width = 320
pd.set_option('display.max_columns', None)
pd.set_option('display.width', desired_width)

# Preview the first five rows of the raw data.
print("\nInput")
print(df.head(5))
# ----------------------------- Play around with Dataframe -------------------------
def _transform_state(state):
if state in ['NY', 'CA', 'London', 'Berlin', 'TX', 'IL', 'OR', 'DC', 'WA']:
return state
return 'Other'
def _transform_yesno(value):
if value == 0.0:
return 'No'
return 'Yes'
# Have a look at the data available in the DataFrame and play around with it.

# Group size and mean price per state, showing the ten largest groups.
print("\nAverage price per state")
state_price_summary = df[['state', 'price']].groupby('state').agg([np.size, np.mean])
print(state_price_summary.sort_values(by=('price', 'size'), ascending=False)[:10])

# Group size, mean and max price per state, ordered by mean price (highest first).
price_stats = df[['state', 'price']].groupby('state').agg([np.size, np.mean, np.max])
price_stats = price_stats.sort_values(by=('price', 'mean'), ascending=False)

# NOTE(review): the heading says "at least 25" but the filter is strictly > 25.
print("\nAverage price per state where size is at least 25")
has_enough_listings = price_stats[('price', 'size')] > 25
print(price_stats[has_enough_listings][:10])
# ----------------------------- Transform Input data -------------------------
# Convert the numeric flag columns into 'Yes'/'No' categorical features.
for flag_column in ('host_is_superhost', 'instant_bookable'):
    df[flag_column] = df[flag_column].apply(_transform_yesno)

# Normalize state, as decided by _transform_state.
df['state'] = df['state'].apply(_transform_state)

print("\nNormalized input")
print(df.head(5))

# Numeric columns used as model inputs.
continuous_features = [
    "bedrooms",
    "square_feet",
    "review_scores_rating",
]

# Categorical columns used as model inputs.
categorical_features = [
    "room_type",
    "state",
    "instant_bookable",
]
# ----------------------------- Fill missing values with mean -------------------------
# FeatureExtractor selects a subset of features from a pandas DataFrame that are
# then passed into a subsequent transformer. MLeap treats this transformer like
# a VectorAssembler equivalent in Spark.
imputed_continuous_features = ['imp_{}'.format(name) for name in continuous_features]
feature_extractor2_tf = FeatureExtractor(
    input_scalars=continuous_features,
    # output_vector: name of the output vector, only used for serialization
    output_vector='imputed_features',
    output_vector_items=imputed_continuous_features,
)

# With strategy "mean", missing values are replaced using the mean along each
# column. Can only be used with numeric data.
impute_mean_null_replacer = Imputer(axis=0, strategy='mean')
# impute_mean_null_replacer = SimpleImputer(strategy='mean')
impute_mean_null_replacer.mlinit(prior_tf=feature_extractor2_tf, output_features='imputed_features')

impute_pipeline = Pipeline([
    (feature_extractor2_tf.name, feature_extractor2_tf),
    (impute_mean_null_replacer.name, impute_mean_null_replacer),
])
impute_pipeline.mlinit()

# Append the imputed columns (imp_*) to the original frame; join aligns on index.
imputed_frame = pd.DataFrame(
    impute_pipeline.fit_transform(df),
    columns=feature_extractor2_tf.output_vector_items,
)
df2 = df.join(imputed_frame)

print("\n-------df2[:5]--------")
print(df2.head(5))
# ----------------------------- END Fill missing values with mean -------------------------
# Model inputs: the imputed numeric columns followed by the categorical ones.
all_features = imputed_continuous_features + categorical_features

# First filter out outlier prices: keep rows priced from 50 to 500, both inclusive.
df2 = df2[df2.price.between(50, 500)]

print("\n-------filtered df2[:5]--------")
print(df2.head(5))

# Split into training and test (33% held out, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df2[all_features],
    df2[['price']],
    test_size=0.33,
    random_state=42,
)
# Step 4: Continuous Feature Pipeline
scaled_feature_names = ["scaled_{}".format(name) for name in imputed_continuous_features]
feature_extractor_tf = FeatureExtractor(
    input_scalars=imputed_continuous_features,
    output_vector='unscaled_cont_features',
    output_vector_items=scaled_feature_names,
)

# Standardize features by removing the mean and scaling to unit variance
standard_scaler_tf = StandardScaler()
standard_scaler_tf.mlinit(prior_tf=feature_extractor_tf, output_features='scaled_cont_features')

standard_scaler_pipeline = Pipeline([
    (feature_extractor_tf.name, feature_extractor_tf),
    (standard_scaler_tf.name, standard_scaler_tf),
])
standard_scaler_pipeline.mlinit()

# Build a preview frame: the scaled columns are re-labelled with df2's index so
# that the join lines rows up correctly after the outlier filter.
scaled_frame = pd.DataFrame(
    standard_scaler_pipeline.fit_transform(df2),
    columns=feature_extractor_tf.output_vector_items,
)
scaled_frame.index = df2.index
df3 = df2.join(scaled_frame)

print("\n-------continuous df[:5]--------")
print(df3.head(5))
# Step 5: Categorical Feature Pipeline
# TODO: Need to fix scikit's One-Hot-Encoder to drop the last column of a matrix if we're using it for ML
# One Hot Encoder is explained here:
# https://scikit-learn.org/stable/modules/preprocessing.html
def _create_le_one_hot_pipeline(feature_name):
    """Build the label-encode + one-hot-encode pipeline for one categorical column.

    The pipeline chains four steps, in order: a FeatureExtractor selecting
    ``feature_name``, a LabelEncoder, a ReshapeArrayToN1 step and a
    OneHotEncoder created with ``sparse=False``.

    Args:
        feature_name: Name of the categorical column to encode. It is also
            used to derive the output names of the individual steps.

    Returns:
        The assembled Pipeline, with ``mlinit()`` already called on it.
    """
    # Select the single categorical column
    feature_extractor3_tf = FeatureExtractor(
        input_scalars=[feature_name],
        output_vector='{}_label'.format(feature_name),
        output_vector_items=[feature_name],
    )

    # Label encoder fed by the extracted column
    label_encoder_tf = LabelEncoder(
        input_features=feature_extractor3_tf.output_vector_items,
        output_features='{}_label_le'.format(feature_name),
    )

    # Reshape the output of the LabelEncoder to N-by-1 array
    reshape_le_tf = ReshapeArrayToN1()

    # One-hot encoder; the label encoder is registered as its prior transformer
    one_hot_encoder_tf = OneHotEncoder(sparse=False)
    one_hot_encoder_tf.mlinit(
        prior_tf=label_encoder_tf,
        output_features='{}_label_one_hot_encoded'.format(feature_name),
    )

    one_hot_pipeline = Pipeline([
        (feature_extractor3_tf.name, feature_extractor3_tf),
        (label_encoder_tf.name, label_encoder_tf),
        (reshape_le_tf.name, reshape_le_tf),
        (one_hot_encoder_tf.name, one_hot_encoder_tf),
    ])
    one_hot_pipeline.mlinit()
    return one_hot_pipeline
# One label-encode/one-hot pipeline per categorical feature, plus the last step
# of each of those pipelines.
oh_pipelines = [_create_le_one_hot_pipeline(name) for name in categorical_features]
oh_fes = [pipeline.steps[-1][1] for pipeline in oh_pipelines]

# Step 6: Assemble our features and feature pipeline
union_members = [(standard_scaler_pipeline.name, standard_scaler_pipeline)]
union_members += [(pipeline.name, pipeline) for pipeline in oh_pipelines]
feature_union = FeatureUnion(union_members)
feature_union.mlinit()
# Step 7: Define our linear regression model
# Collect the output feature name of every one-hot encoder step, per pipeline...
oh_features_lists = [
    [step.output_features for _step_name, step in pipeline.steps if step.op == 'one_hot_encoder']
    for pipeline in oh_pipelines
]
# ...and flatten them into a single list.
oh_features = [feature for features in oh_features_lists for feature in features]

print("\n-------oh_features--------")
print(oh_features)

# Vector Assembler, for serialization purposes only
# NOTE(review): input_vectors starts with feature_extractor_tf while
# output_vector_items starts with standard_scaler_tf.output_features; confirm
# that the unscaled extractor is the intended input here.
lr_input_vectors = [feature_extractor_tf] + oh_fes
lr_input_names = [standard_scaler_tf.output_features] + oh_features
feature_extractor_lr_model_tf = FeatureExtractor(
    input_vectors=lr_input_vectors,
    output_vector='input_features',
    output_vector_items=lr_input_names,
)
feature_extractor_lr_model_tf.skip_fit_transform = True

# Define our linear regression
lr_model = LinearRegression()
lr_model.mlinit(input_features='input_features', prediction_column='price_prediction')

lr_model_pipeline = Pipeline([
    (feature_extractor_lr_model_tf.name, feature_extractor_lr_model_tf),
    (lr_model.name, lr_model),
])
lr_model_pipeline.mlinit()

# Full model: the feature union followed by the regression pipeline.
model_pipeline = Pipeline([
    (feature_union.name, feature_union),
    (lr_model_pipeline.name, lr_model_pipeline),
])
model_pipeline.mlinit()
import json  # local import: only needed for the serialization workaround below

model_pipeline.fit(X_train, y_train)

# Serialize the linear regression model
#
# mleap writes each transformer with a plain json.dump(), and the stock JSON
# encoder rejects numpy scalars such as int64 with
# "TypeError: Object of type int64 is not JSON serializable".
# While the bundle is being written, teach the encoder to convert numpy values
# to native Python ones, then restore the stock behaviour.
_stock_json_default = json.JSONEncoder.default


def _numpy_aware_json_default(encoder, obj):
    """Convert numpy scalars/arrays to Python values; defer everything else."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return _stock_json_default(encoder, obj)


json.JSONEncoder.default = _numpy_aware_json_default
try:
    model_pipeline.serialize_to_bundle('/git/spike', 'scikit-airbnb.lr.zip', init=True)
finally:
    # Always undo the patch so unrelated json usage keeps its normal behaviour.
    json.JSONEncoder.default = _stock_json_default
数据集位于 https://s3-us-west-2.amazonaws.com/mleap-demo/datasources/airbnb.csv