通过匹配Sparklyr中的字符串来创建新变量

时间:2018-07-31 15:36:42

标签: r apache-spark dplyr sparklyr

我第一次使用sparklyr,在通过匹配两个向量中的字符串来大规模地创建新变量时遇到了麻烦。我的问题大致具有以下结构:

我有一个大型的网址集:

# Example URL table: a numeric row id (col1) and one URL per row (col2).
df_1 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  )
)
col1            col2
 1   john.com/abcd
 2  ringo.com/defg
 3   paul.com/hijk
 4 george.com/lmno
 5    rob.com/pqrs
 6    sam.com/tuvw
 7   matt.com/xyza
 8  lenny.com/bcde
 9    bob.com/fghi
10    tom.com/jklm

以及另一个较小的数据集,其中包含一般的域名:

# Lookup table of bare domains: a numeric id (col1) and the domain (col2).
df_2 <- data.frame(
  col1 = as.numeric(1:7),
  col2 = c(
    "john.com", "jake.com", "tim.com", "paul.com",
    "rob.com", "harry.com", "chris.com"
  )
)
col1      col2
    1  john.com
    2  jake.com
    3   tim.com
    4  paul.com
    5   rob.com
    6 harry.com
    7 chris.com

我想使用df_2中的域名向量(df_2$col2)为df_1创建一个虚拟变量,用来指示这些域名是否出现在df_1的网址(df_1$col2)中。结果数据框应类似于df_3。

# Desired result: df_1 plus a 0/1 dummy column (col3). Rows 1, 3 and 5 are the
# URLs whose domain is listed in df_2.
df_3 <- data.frame(
  col1 = as.numeric(1:10),
  col2 = c(
    "john.com/abcd", "ringo.com/defg", "paul.com/hijk", "george.com/lmno",
    "rob.com/pqrs", "sam.com/tuvw", "matt.com/xyza", "lenny.com/bcde",
    "bob.com/fghi", "tom.com/jklm"
  ),
  col3 = as.numeric(1:10 %in% c(1, 3, 5))
)
   col1            col2 col3
     1   john.com/abcd    1
     2  ringo.com/defg    0
     3   paul.com/hijk    1
     4 george.com/lmno    0
     5    rob.com/pqrs    1
     6    sam.com/tuvw    0
     7   matt.com/xyza    0
     8  lenny.com/bcde    0
     9    bob.com/fghi    0
    10    tom.com/jklm    0

我已经阅读了这篇文章:How to filter on partial match using sparklyr

并尝试使用类似下面的代码,把df_2中的每个观测值逐一硬编码进去:
# Non-working attempt, kept as posted (illustrative pseudo-code): `etc.` stands
# for the remaining domains that would each have to be hard-coded by hand.
df_3 <- df_1 %>%
  mutate(col3 = 
    ifelse(like(df_1$col2, "john.com") | df_1$col2, "jake.com" | etc.,1,0))

但是到目前为止,我要么遇到堆栈限制,要么R无法识别like函数。一定有更简单的方法可以做到这一点。感谢您的任何帮助。

1 个答案:

答案 0 :(得分:0)

如果您正在寻找一个定义良好的前缀(如此处),则可以将其提取:

# Copy both local data frames into Spark. `sc` is assumed to be an open Spark
# connection created elsewhere (e.g. via spark_connect()).
sdf_1 <- copy_to(sc, df_1)
sdf_2 <- copy_to(sc, df_2)

# Derive the join key: the domain, i.e. everything before the FIRST "/".
# `[^/]*` cannot cross a slash, so a URL with a deeper path such as
# "john.com/a/b" still yields "john.com"; a greedy `.*` would capture up to
# the last slash ("john.com/a") and the row would never match a domain.
sdf_1_keyed <- sdf_1 %>%
  mutate(key = regexp_extract(col2, "^([^/]*)/", 1))

应用左等值连接(left equi-join):

# Left equi-join on the extracted domain. The lookup table is reduced to the
# join key (its domain) and its id, so URLs without a matching domain keep
# their row and get a missing `id`.
matched <- left_join(
  sdf_1_keyed,
  select(sdf_2, key = col2, id = col1),
  by = "key"
)

然后进行汇总:

# Flag each joined row (1 when a domain matched, 0 otherwise), then collapse to
# one row per URL: col3 is 1 if at least one flag is set.
matched %>%
  mutate(hit = as.numeric(!is.na(id))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(hit, na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups:   col1
    col1 col2             col3
   <dbl> <chr>           <dbl>
 1     1 john.com/abcd       1
 2     5 rob.com/pqrs        1
 3     6 sam.com/tuvw        0
 4     9 bob.com/fghi        0
 5     3 paul.com/hijk       1
 6     4 george.com/lmno     0
 7     8 lenny.com/bcde      0
 8    10 tom.com/jklm        0
 9     2 ringo.com/defg      0
10     7 matt.com/xyza       0
# ... with more rows

使用交叉连接加RLIKE条件也可以实现类似的效果:

# Pair every URL with every candidate domain. The cross join is invoked
# directly on the underlying Spark DataFrames, and the result is registered
# back as a tbl so dplyr verbs can be used on it.
candidates <- sdf_register(
  sparklyr::invoke(
    spark_dataframe(sdf_1),
    "crossJoin",
    spark_dataframe(transmute(sdf_2, target = col2))
  )
)

# `target` is used as an RLIKE (regular expression) pattern against the URL.
# A URL gets col3 = 1 when at least one paired domain matches it.
candidates %>%
  mutate(hit = as.numeric(rlike(col2, target))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(hit, na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups:   col1
    col1 col2             col3
   <dbl> <chr>           <dbl>
 1     1 john.com/abcd       1
 2     5 rob.com/pqrs        1
 3     6 sam.com/tuvw        0
 4     9 bob.com/fghi        0
 5     3 paul.com/hijk       1
 6     4 george.com/lmno     0
 7     8 lenny.com/bcde      0
 8    10 tom.com/jklm        0
 9     2 ringo.com/defg      0
10     7 matt.com/xyza       0
# ... with more rows

最后,您可以提取唯一值:

# Distinct domains from the local lookup table, as plain character strings.
targets <- as.character(unique(df_2[["col2"]]))

并创建SQL表达式:

library(glue)

# One "col2 rlike '<domain>'" predicate per target, OR-ed together into a
# single SQL boolean expression.
expr <- glue_collapse(glue("col2 rlike '{targets}'"), sep = " OR ")
col3_sql <- as.character(glue("{expr} as col3"))

# selectExpr keeps every existing column ("*") and appends the boolean col3,
# which is then cast to numeric on the dplyr side.
sdf_1 %>%
  spark_dataframe() %>%
  sparklyr::invoke("selectExpr", list("*", col3_sql)) %>%
  sdf_register() %>%
  mutate(col3 = as.numeric(col3))
 # Source:   lazy query [?? x 3]
 # Database: spark_connection
     col1 col2             col3
    <dbl> <chr>           <dbl>
  1     1 john.com/abcd       1
  2     2 ringo.com/defg      0
  3     3 paul.com/hijk       1
  4     4 george.com/lmno     0
  5     5 rob.com/pqrs        1
  6     6 sam.com/tuvw        0
  7     7 matt.com/xyza       0
  8     8 lenny.com/bcde      0
  9     9 bob.com/fghi        0
 10    10 tom.com/jklm        0
 # ... with more rows

或R表达式:

library(rlang)

# Build the R source text "rlike(col2, 'a') | rlike(col2, 'b') | ...".
rexpr <- glue_collapse(glue("rlike(col2, '{targets}')"), " | ")

# Parse the text into an expression and splice it into mutate() with `!!`.
# parse_expr() is used instead of parse_quosure(), which has been deprecated
# and removed from current rlang releases. No quosure environment is needed
# because the expression refers only to columns and translated functions.
sdf_1 %>%
  mutate(col3 = !!parse_expr(as.character(glue("as.numeric({rexpr})"))))
# Source:   lazy query [?? x 3]
# Database: spark_connection
    col1 col2             col3
   <dbl> <chr>           <dbl>
 1     1 john.com/abcd       1
 2     2 ringo.com/defg      0
 3     3 paul.com/hijk       1
 4     4 george.com/lmno     0
 5     5 rob.com/pqrs        1
 6     6 sam.com/tuvw        0
 7     7 matt.com/xyza       0
 8     8 lenny.com/bcde      0
 9     9 bob.com/fghi        0
10    10 tom.com/jklm        0
# ... with more rows