我想从数据框中删除一些列,然后应用ML算法。我通过构建2个独立的管道来做到这一点。我的问题是如何将两个管道合并成一个管道?
#######################
from typing import Iterable
import pandas as pd
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
#######################
#Custom Class
#######################
class ColumnDropper_test(Transformer):
    """Pipeline stage that drops every column whose name contains any
    of the banned substrings.

    Note: matching is by substring (``banned in column_name``), so
    banning ``'ball'`` would also drop ``'ball_column'``.
    """

    def __init__(self, banned_list: Iterable[str]):
        super().__init__()
        # Materialize once: a generator passed by the caller would be
        # exhausted after the first column check in _transform, silently
        # keeping all remaining banned columns.
        self.banned_list = list(banned_list)

    def _transform(self, df: DataFrame) -> DataFrame:
        """Return *df* without the columns matching a banned substring."""
        doomed = [col for col in df.columns
                  if any(banned in col for banned in self.banned_list)]
        return df.drop(*doomed)
#######################
#Sample Data
#######################
# Sample data: four numeric columns plus a 'target' label column.
columns = {
    'ball_column': [0, 1, 2, 3],
    'keep_column': [7, 8, 9, 10],
    'hall_column': [14, 15, 16, 17],
    'banned_me': [14, 15, 16, 17],
    'target': [21, 31, 41, 51],
}
data = pd.DataFrame(columns)
df = spark.createDataFrame(data)
#######################
# First Pipeline
#######################
# Fit and apply the dropper stage on its own; 'model' is the pruned DataFrame.
column_dropper = ColumnDropper_test(banned_list=['banned_me'])
dropper_pipeline = Pipeline(stages=[column_dropper])
model = dropper_pipeline.fit(df).transform(df)
#######################
#Second Pipeline(Question: Add the block of code below to the above pipeline)
#########################
# Assemble every remaining column except the label into a feature vector,
# then train and score a decision tree on a 50/50 random split.
feature_cols = [col for col in model.columns if col != 'target']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='target')
ml_pipeline = Pipeline(stages=[assembler, dtc])
train_data, test_data = model.randomSplit([0.5, 0.5])
fitted = ml_pipeline.fit(train_data)
predictions = fitted.transform(test_data)
predictions.select('features','Prediction').show()
我发现的挑战在于上面代码中的变量 ready。由于 ready 是在 column_dropper 运行之后通过 model.columns 计算出来的（列数更少），如果改用原始的 df.columns 并把各阶段放进同一个管道，就会产生下面的错误，因为 banned_me 列已经从数据中被删除了。
# Combining both pipelines -- failed attempt: 'assembler' was built with
# inputCols computed from the ORIGINAL df (which still contains 'banned_me'),
# so at transform time it asks for a column the dropper stage already removed,
# raising IllegalArgumentException: field "banned_me" does not exist.
model = Pipeline(stages=[column_dropper,assembler,dtc]).fit(df).transform(df)
调用o188.transform时发生错误。 : java.lang.IllegalArgumentException:字段“ banned_me”不存在。 可用字段:ball_column,keep_column,hall_column,target
我最初的想法是创建一个继承 ColumnDropper_test 的新类，让它持有删除列之后的新列清单。如何让 Pipeline 中的 assembler 阶段使用 column_dropper 阶段输出的新 DataFrame 的列，而不是原始 df 的列？
答案 0 :(得分:3)
您必须创建一个继承 VectorAssembler 的自定义类，让它在转换时自动设置 inputCols：
from pyspark import keyword_only
class CustomVecssembler(VectorAssembler):
    """VectorAssembler that resolves its own ``inputCols`` at transform time.

    Because the input columns are read from the DataFrame the stage actually
    receives, it can sit *after* a column-dropping stage inside one Pipeline:
    it assembles every column present at that point except the label column.

    Parameters
    ----------
    outputCol : str, default 'features'
        Name of the assembled vector column.
    labelCol : str, default 'target'
        Column excluded from the feature vector (previously hard-coded).
    """

    @keyword_only
    def __init__(self, outputCol='features', labelCol='target'):
        super(CustomVecssembler, self).__init__()
        # Plain attribute: VectorAssembler declares no 'labelCol' Param,
        # so this value must NOT be forwarded to _set() below.
        self.label_col = labelCol
        self.transformer = VectorAssembler(outputCol=outputCol)
        # Spark 2.1 stored the captured kwargs on the function object;
        # later versions store them on the instance.
        if spark.version.startswith('2.1'):
            kwargs = self.__init__._input_kwargs
        else:
            kwargs = self._input_kwargs
        kwargs.pop('labelCol', None)  # keep it away from the Params machinery
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, outputCol='features'):
        """Forward the captured keyword arguments to the underlying Params."""
        if spark.version.startswith('2.1'):
            kwargs = self.__init__._input_kwargs
        else:
            kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, df):
        """Assemble all non-label columns of *df* into the output vector."""
        ready = [col for col in df.columns if col != self.label_col]
        # Mirror the resolved columns on our own Params for introspection,
        # but delegate the actual assembling to the wrapped transformer.
        self.setInputCols(ready)
        self.transformer.setInputCols(ready)
        return self.transformer.transform(df)
验证其是否有效:
# prep dataset: rebuild the sample frame used throughout the question.
raw = {
    'ball_column': [0, 1, 2, 3],
    'keep_column': [7, 8, 9, 10],
    'hall_column': [14, 15, 16, 17],
    'banned_me': [14, 15, 16, 17],
    'target': [21, 31, 41, 51],
}
df = spark.createDataFrame(pd.DataFrame(raw))

# ORIGINAL IMPLEMENTATION: drop first, then wire the assembler by hand
# from the already-pruned column list.
column_dropper = ColumnDropper_test(banned_list=['banned_me'])
model = Pipeline(stages=[column_dropper]).fit(df).transform(df)
feature_cols = [col for col in model.columns if col != 'target']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='target')
train_data, test_data = model.randomSplit([0.5, 0.5])
fitted = Pipeline(stages=[assembler, dtc]).fit(train_data)
fitted.transform(test_data).select('features','Prediction').show()
# +--------------+----------+
# | features|Prediction|
# +--------------+----------+
# |[1.0,15.0,8.0]| 51.0|
# |[2.0,16.0,9.0]| 51.0|
# +--------------+----------+
# USING CUSTOM VEC ASSEMBLER: one pipeline -- drop, assemble, classify --
# with inputCols resolved after the dropper has run.
new_assembler = CustomVecssembler(outputCol='features')
combined = Pipeline(stages=[column_dropper, new_assembler, dtc])
new_pipeline = combined.fit(train_data)
new_results = new_pipeline.transform(test_data)
new_results.select('features', 'Prediction').show()
# +--------------+----------+
# | features|Prediction|
# +--------------+----------+
# |[1.0,15.0,8.0]| 51.0|
# |[2.0,16.0,9.0]| 51.0|
# +--------------+----------+