Sparklyr:标准缩放器(Standard Scaler)

时间:2018-11-02 22:09:18

标签: r dplyr data-science sparklyr

我正在尝试使用以下代码对某些特征进行标准化。标准化本身似乎运行良好,但缩放后的特征被存储为向量列表。如何将这个缩放后的特征列表转换为数据框?

# Open a local Spark session and upload the iris data set into it.
sc <- spark_connect(master = "local")
iris_tbl <- sdf_copy_to(sc, iris, name = "iris_tbl", overwrite = TRUE)

# Numeric columns that get packed into one vector column and standardised.
features <- c(
  "Sepal_Length",
  "Sepal_Width",
  "Petal_Length",
  "Petal_Width"
)

# Assemble the feature columns into "features_temp", then scale that vector
# column into "features" (with_mean = TRUE additionally centres the values).
scaled <- iris_tbl %>%
  ft_vector_assembler(
    input_col = features,
    output_col = "features_temp"
  ) %>%
  ft_standard_scaler(
    input_col = "features_temp",
    output_col = "features",
    with_mean = TRUE
  )

# Inspect the columns of the result.
glimpse(scaled)
Observations: ??
Variables: 7
$ Sepal_Length  <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,...
$ Sepal_Width   <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3.0, 3.0, 4.0, 4.4, 3.9, 3.5, 3.8, 3.8,...
$ Petal_Length  <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5,...
$ Petal_Width   <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3,...
$ Species       <chr> "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa", "setosa",...
$ features_temp <list> [<5.1, 3.5, 1.4, 0.2>, <4.9, 3.0, 1.4, 0.2>, <4.7, 3.2, 1.3, 0.2>, <4.6, 3.1, 1.5, 0.2>, <5.0, 3.6...
$ features      <list> [<-0.8976739, 1.0156020, -1.3357516, -1.3110521>, <-1.1392005, -0.1315388, -1.3357516, -1.3110521>...

解决方案(使用sdf_separate_column):

可以使用 sdf_separate_column 函数将向量列表拆分为数据框中的单独列。

# Standardise the four iris measurements with Spark ML, unpack the scaled
# vector column into ordinary numeric columns, run a PCA on those columns and
# plot the first two principal components.

# Attach the packages the script relies on so it runs in a fresh R session.
library(sparklyr)
library(dplyr)
library(ggplot2)

# Open a local Spark session and upload the iris data set into it.
sc <- spark_connect(master = "local")
iris_tbl <- sdf_copy_to(sc, iris, name = "iris_tbl", overwrite = TRUE)

# Input columns to standardise, and the names given to the unpacked,
# standardised columns (listed in the same order as `features`).
features <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width")
features_std <- c(
  "Sepal_Length_std",
  "Sepal_Width_std",
  "Petal_Length_std",
  "Petal_Width_std"
)

# Assemble the inputs into the vector column "features_temp", scale it into
# "features" (with_mean = TRUE also centres the values), then split the
# scaled vector into one column per entry of `features_std`.
scaled_tbl <- iris_tbl %>%
  ft_vector_assembler(
    input_col = features,
    output_col = "features_temp"
  ) %>%
  ft_standard_scaler(
    input_col = "features_temp",
    output_col = "features",
    with_mean = TRUE
  ) %>%
  sdf_separate_column("features", features_std)

# Fit the PCA on the standardised columns only. all_of() states explicitly
# that `features_std` is an external character vector of column names rather
# than a column of the table; the bare-vector form is ambiguous and
# deprecated in tidyselect.
pca_model <- scaled_tbl %>%
  select(all_of(features_std)) %>%
  ml_pca()

# Project the data onto the principal components and bring the result into
# local R memory for plotting.
projection <- pca_model %>%
  sdf_project() %>%
  collect()

# Scatter plot of the first two principal components.
g <- projection %>%
  ggplot(aes(x = PC1, y = PC2)) +
  geom_point()
print(g)

0 个答案:

没有答案