我在 PySpark 中使用 MLlib 实现了一个 CustomEstimator,它返回一个 CustomEstimatorModel。我想把模型的元数据一并保存下来。在 Scala 中,我看到有一个名为 “Saveable”(可保存)的接口。我在 PySpark 中也可以使用它吗?
伪代码:
class CustomEstimatorModel(Model, DefaultParamsReadable, DefaultParamsWritable):
    """Model returned by ``CustomEstimator._fit`` (pseudo-code sketch).

    Its transform step left-joins the incoming DataFrame with whatever
    ``read_metadata_from_training_data()`` returns, using ``somekey`` as the
    join column.
    """

    def _transform(self, df):
        """Return ``df`` left-joined with the training metadata.

        Args:
            df: The DataFrame to enrich; the join is on its ``somekey``
                column.

        Returns:
            The result of ``df.join(...)`` with ``how='left'``.
        """
        # Open question from the author: what is the cleanest way to read
        # this in? (The helper is not defined in this snippet.)
        metadata = read_metadata_from_training_data()
        return df.join(metadata, on=['somekey'], how='left')
class CustomEstimator(Estimator, DefaultParamsReadable, DefaultParamsWritable):
    """Estimator whose ``_fit`` returns a ``CustomEstimatorModel`` (pseudo-code sketch)."""

    @keyword_only
    def __init__(self):
        """Initialise the base classes, then forward ``self._input_kwargs`` to ``setParams``.

        ``_input_kwargs`` is presumably populated by the ``@keyword_only``
        decorator -- confirm against the PySpark version in use.
        """
        super(CustomEstimator, self).__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self):
        """Pass ``self._input_kwargs`` to ``self._set`` and return its result."""
        return self._set(**self._input_kwargs)

    def _fit(self, df):
        """Compute metadata from ``df``, save it, and return a new model.

        Args:
            df: The training DataFrame handed to
                ``calculate_metadata_from_training_data``.

        Returns:
            A freshly constructed ``CustomEstimatorModel``; note that the
            computed metadata is not passed to it.
        """
        metadata = calculate_metadata_from_training_data(df)
        # Open question from the author: what is the cleanest way to store
        # this together with all the other components in my pipeline?
        save(metadata)
        return CustomEstimatorModel()