此示例摘自Sparklyr文档
https://spark.rstudio.com/guides/pipelines/
# Question's example pipeline (from the sparklyr docs):
# dplyr transformer -> binarize dep_delay -> bucketize sched_dep_time
# -> R formula -> logistic regression estimator.
# Fix: the original used the dotted argument spellings (input.col /
# output.col), which are the deprecated sparklyr names; use the current
# underscore spellings, consistent with the answer's code below.
flights_pipeline <- ml_pipeline(sc) %>%
ft_dplyr_transformer(
tbl = df
) %>%
ft_binarizer(
input_col = "dep_delay",
output_col = "delayed",
threshold = 15
) %>%
ft_bucketizer(
input_col = "sched_dep_time",
output_col = "hours",
splits = c(400, 800, 1200, 1600, 2000, 2400)
) %>%
ft_r_formula(delayed ~ month + day + hours + distance) %>%
ml_logistic_regression()
从上面的示例中可以明显看出,该管道是线性的,它仅使用sparklyr的内置转换函数和dplyr函数来处理数据。
有没有一种方法可以在sparklyr管道中使用自定义转换器(例如:在自定义函数中具有for循环)?
答案 0(得分:1)
如果您的数据整理(wrangling)非常简单,则可以通过ft_sql_transformer
在管道中使用SQL来完成。例如,如果要在管道中添加一个修改后的列,则可以执行以下操作:
# Same pipeline as the question's, with an ft_sql_transformer stage inserted
# so a derived column ("example") is added via SQL before the
# feature-engineering steps. Written as sequential assignments instead of a
# single pipe chain; each ft_*/ml_* call appends one stage to the pipeline.
flights_pipeline <- ml_pipeline(sc)
flights_pipeline <- ft_dplyr_transformer(flights_pipeline, tbl = df)
flights_pipeline <- ft_sql_transformer(
  flights_pipeline,
  "select *, distance + 47 as example from __THIS__"
)
flights_pipeline <- ft_binarizer(
  flights_pipeline,
  input_col = "dep_delay",
  output_col = "delayed",
  threshold = 15
)
flights_pipeline <- ft_bucketizer(
  flights_pipeline,
  input_col = "sched_dep_time",
  output_col = "hours",
  splits = c(400, 800, 1200, 1600, 2000, 2400)
)
flights_pipeline <- ft_r_formula(
  flights_pipeline,
  delayed ~ month + day + hours + distance
)
flights_pipeline <- ml_logistic_regression(flights_pipeline)
可以在管道中运行的这类SQL代码存在一些限制(limitations),但我希望这对您有用。下面是我测试过的完整示例。请注意最终表中新增的修改列。
# Load the demo dataset (nycflights13), sparklyr, and dplyr.
library(nycflights13)
library(sparklyr)
library(dplyr)
# Connect to a local Spark instance (answer tested against Spark 2.2.0).
sc <- spark_connect(master = "local", spark_version = "2.2.0")
## * Using Spark: 2.2.0
# Copy the flights data frame into Spark.
spark_flights <- sdf_copy_to(sc, flights)
# dplyr "recipe": drop rows with missing dep_delay, recode month/day as
# strings so they act as categorical values, keep only the columns used
# downstream. This is lazy -- nothing executes in Spark yet.
df <- spark_flights %>%
filter(!is.na(dep_delay)) %>%
mutate(
month = paste0("m", month),
day = paste0("d", day)
) %>%
select(dep_delay, sched_dep_time, month, day, distance)
# Inspect the transformer stage sparklyr builds from the dplyr code,
# then extract the SQL statement it generated.
ft_dplyr_transformer(sc, df)
ft_dplyr_transformer(sc, df) %>%
ml_param("statement")
# Full tested pipeline: dplyr transformer -> SQL transformer (adds the
# derived column example = distance + 47) -> binarize dep_delay at a
# 15-minute threshold into "delayed" -> bucket sched_dep_time into
# 4-hour "hours" buckets -> R formula -> logistic regression estimator.
flights_pipeline <- ml_pipeline(sc) %>%
ft_dplyr_transformer(
tbl = df
) %>%
ft_sql_transformer(
# __THIS__ is Spark's placeholder for the stage's input table.
"select *, distance + 47 as example from __THIS__") %>%
ft_binarizer(
input_col = "dep_delay",
output_col = "delayed",
threshold = 15
) %>%
ft_bucketizer(
input_col = "sched_dep_time",
output_col = "hours",
splits = c(400, 800, 1200, 1600, 2000, 2400)
) %>%
ft_r_formula(delayed ~ month + day + hours + distance) %>%
ml_logistic_regression()
# Print the (unfitted) pipeline stages.
flights_pipeline
# Split the data; tiny training/testing fractions keep the demo fast.
partitioned_flights <- sdf_partition(
spark_flights,
training = 0.01,
testing = 0.01,
rest = 0.98
)
# Fit every stage of the pipeline on the training partition.
fitted_pipeline <- ml_fit(
flights_pipeline,
partitioned_flights$training
)
fitted_pipeline
# Apply the fitted pipeline to the test partition; the output includes the
# "example" column added by the SQL transformer stage.
predictions <- ml_transform(
fitted_pipeline,
partitioned_flights$testing
)