Question

我有一个基于 PySpark MLlib 的 CustomEstimator，它返回一个 CustomEstimatorModel。我想把模型的元数据一并保存下来。在 Scala 中，我看到有一个名为 `Saveable` 的接口；在 PySpark 中也可以使用它吗？

伪代码：

class CustomEstimatorModel(Model, DefaultParamsReadable, DefaultParamsWritable):
    """Model that left-joins externally loaded metadata onto the input DataFrame.

    Pseudocode: ``read_metadata_from_training_data`` is a placeholder that is
    not defined in this snippet.
    """

    def _transform(self, df):
        """Return ``df`` left-joined with the loaded metadata on ``somekey``."""
        # Open question from the author: what is the cleanest way to read this in?
        metadata_df = read_metadata_from_training_data()
        return df.join(metadata_df, on=['somekey'], how='left')


class CustomEstimator(Estimator, DefaultParamsReadable, DefaultParamsWritable):
    """Estimator that computes metadata from its input, saves it, and returns a model.

    Pseudocode: ``calculate_metadata_from_training_data`` and ``save`` are
    placeholders that are not defined in this snippet.
    """

    @keyword_only
    def __init__(self):
        """Initialise the base class, then forward ``self._input_kwargs`` to ``setParams``."""
        super(CustomEstimator, self).__init__()
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self):
        """Pass ``self._input_kwargs`` to ``self._set`` and return its result."""
        return self._set(**self._input_kwargs)

    def _fit(self, df):
        """Compute metadata from ``df``, save it, and return a new ``CustomEstimatorModel``."""
        metadata_df = calculate_metadata_from_training_data(df)
        # Open question from the author: what is the cleanest way to store this
        # together with all the other components in my pipeline?
        save(metadata_df)
        return CustomEstimatorModel()

PySpark：在自定义估算器中存储模型元数据

0 个答案: