How to use the sdf_pivot() function in sparklyr

Asked: 2018-02-07 20:37:40

Tags: r apache-spark dplyr sparkr sparklyr

I am trying to dcast my Spark DataFrame using the sdf_pivot() function. I want to display the values of a column, the way the value.var argument of dcast() from the reshape2 package does. Please see the example below.

id <- c(1,1,1,1,1,2,2,2,3,3,3)
name <- c("A","B","C","D","E","A","B","C","D","E","F")
value <- c(1,2,3,1,1,2,3,1,1,2,3)
dt <- data.frame(id, name, value)
# cast to wide format: one column per name, cells filled from "value"
reshape2::dcast(dt, id ~ name, value.var = "value")

Output 1:

  id  A  B  C  D  E  F
1  1  1  2  3  1  1 NA
2  2  2  3  1 NA NA NA
3  3 NA NA NA  1  2  3

spark_dt <- copy_to(sc, dt)
sdf_pivot(spark_dt, id ~ name)

Output 2:

     id     A     B     C     D     E     F
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     1     1     1   NaN
2     3   NaN   NaN   NaN     1     1     1
3     2     1     1     1   NaN   NaN   NaN
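
(Note: the default fun.aggregate in sdf_pivot() appears to be "count", which is why each id/name combination that occurs shows a 1 here rather than its value.)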

It seems there is no value.var argument in the sdf_pivot() function. I am new to Spark, and any suggestion would be appreciated. Do I need to write a custom function?

Note: I have tried

## Pivoting
cohort_paste <- function(gdf) {
  # build a column expression by invoking a JVM function named "paste"
  # on the "value" column (there is no such function in
  # org.apache.spark.sql.functions, hence the error below)
  expr <- invoke_static(
    sc,
    "org.apache.spark.sql.functions",
    "paste",
    "value"
  )
  gdf %>% invoke("agg", expr, list())
}

which gives the error:

Error: java.lang.IllegalArgumentException: Invalid method paste for object org.apache.spark.sql.functions

What I actually want is to use the paste function.

Attempt with a numeric value column:

df <- tibble(
  id = c(rep(1, 9), rep(2, 9)),
  name = rep(rep(c("A", "B", "C"), each = 3), 2),
  value = sample(10, 18, replace = TRUE)
)[sample(1:18, size = 10), ]

spark_dt <- copy_to(sc, df, overwrite=TRUE)

collect_list <- function(gdf) {
    expr <- invoke_static(
        sc,
        "org.apache.spark.sql.functions",
        "collect_list",
        "value"
    )
    gdf %>% invoke("agg", expr, list())
}

sdf_pivot(spark_dt, id ~ name, fun.aggregate=collect_list) %>% 
    mutate_at(vars(-id), funs(concat_ws(" ", .)))

Error log:

Error: org.apache.spark.sql.AnalysisException: cannot resolve 'concat_ws(' ', sparklyr_tmp_79e15abf584.A)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, 'sparklyr_tmp_79e15abf584.A' is of array<int> type.; line 1 pos 13;
'GlobalLimit 10
+- 'LocalLimit 10
   +- 'Project [id#3038, concat_ws( , A#3156) AS A#3172, concat_ws( , B#3158) AS B#3173, concat_ws( , C#3160) AS C#3174]
      +- SubqueryAlias sparklyr_tmp_79e15abf584
         +- Aggregate [id#3038], [id#3038, collect_list(if ((name#3039 = A)) value#3040 else cast(null as int), 0, 0) AS A#3156, collect_list(if ((name#3039 = B)) value#3040 else cast(null as int), 0, 0) AS B#3158, collect_list(if ((name#3039 = C)) value#3040 else cast(null as int), 0, 0) AS C#3160]
            +- Project [id#3038, name#3039, value#3040]
               +- SubqueryAlias df
                  +- Relation[id#3038,name#3039,value#3040] csv

1 Answer:

Answer 0 (score: 1):

This fails because paste is not a Spark function, and you cannot execute R code in this context.

You can try something like this:

library(dplyr)
library(sparklyr)

sc <- spark_connect("local[8]")
set.seed(1)

df <- tibble(
  id = c(rep(1, 9), rep(2, 9)),
  name = rep(rep(c("A", "B", "C"), each=3), 2),
  value = sample(letters, size=18)
)[sample(1:18, size=10), ]

spark_dt <- copy_to(sc, df, overwrite=TRUE)

collect_list <- function(gdf) {
  # Spark SQL's collect_list gathers the "value"s of each group into an array
  expr <- invoke_static(
    sc,
    "org.apache.spark.sql.functions",
    "collect_list",
    "value"
  )
  gdf %>% invoke("agg", expr, list())
}

sdf_pivot(spark_dt, id ~ name, fun.aggregate=collect_list) %>% 
  mutate_at(vars(-id), funs(concat_ws(" ", .)))

# # Source:   lazy query [?? x 4]
# # Database: spark_connection
#       id A     B     C    
#    <dbl> <chr> <chr> <chr>
#  1  1.00 j g   u e   w    
#  2  2.00 b c   v x   f  
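
For a numeric value column, like the one in the question's second attempt, concat_ws fails with the array<int> type mismatch shown in the error log. One workaround (a sketch of my own, not part of the original answer, assuming as.character() translates to a Spark CAST, which sparklyr supports) is to cast the column to string before pivoting:

# sketch: with the numeric spark_dt from the question, cast "value" to
# string first so collect_list yields array<string>, which concat_ws accepts
spark_dt %>%
  mutate(value = as.character(value)) %>%
  sdf_pivot(id ~ name, fun.aggregate = collect_list) %>%
  mutate_at(vars(-id), funs(concat_ws(" ", .)))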

You can also use window functions:

first <- function(gdf) {
  # Spark SQL's first returns the first "value" in each group
  expr <- invoke_static(
    sc,
    "org.apache.spark.sql.functions",
    "first",
    "value"
  )
  gdf %>% invoke("agg", expr, list())
}


spark_dt %>% 
  group_by(id, name) %>% 
  arrange(value) %>% 
  mutate(i = row_number()) %>% 
  mutate(name = concat_ws("_", name,  i)) %>% 
  select(-i) %>% sdf_pivot(id ~ name, first)

# # Source:   table<sparklyr_tmp_1ba404d8f51> [?? x 8]
# # Database: spark_connection
#      id A_1   A_2   A_3   B_1   B_2   B_3   C_1  
#   <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1  1.00 m     NA    NA    f     n     v     d    
# 2  2.00 b     x     y     h     r     NA    NA
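
The window-function variant works because row_number() (ordered by value within each id/name group) makes every pivoted column name such as A_1, A_2 unique, so the first aggregate simply picks the single value left in each cell.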