Question

如何对R中嵌入在tibble列表列里的字符向量执行函数？

背景是这样的：我合并了2个数据集，两个数据集都包含一个自由文本的地址字段，我想统计每对自由文本字段中共同单词的数量。我之后会研究模糊逻辑，但也希望找到解决当前这个问题的方法。

例如，请考虑以下代码：

# Load the tidyverse (tibble, dplyr, stringr, purrr). library() stops with an
# error when the package is missing, whereas require() only returns FALSE and
# lets the script fail later with a less obvious message.
library(tidyverse)

# Two free-text columns, one row per pair of records to compare.
df <- tibble(
  x = c("one two three four", "two three four five"),
  y = c("three four five six", "four five six seven")
)

# Split every string into its alphanumeric tokens. str_extract_all() returns a
# list with one character vector per input string, so x.lst and y.lst are list
# columns. glimpse() prints the structure and passes the tibble through.
df.lst <- df %>%
  mutate(
    x.lst = str_extract_all(x, "[:alnum:]+"),
    y.lst = str_extract_all(y, "[:alnum:]+")
  ) %>%
  glimpse()

# Observations: 2
# Variables: 4
# $ x     <chr> "one two three four", "two three four five"
# $ y     <chr> "three four five six", "four five six seven"
# $ x.lst <list> [<"one", "two", "three", "four">, <"two", "three", "four", "...
# $ y.lst <list> [<"three", "four", "five", "six">, <"four", "five", "six", "...

# Failing attempt kept as-is to reproduce the error quoted below: inside
# mutate() the names x.lst and y.lst refer to the entire list columns, so
# intersect() compares the two lists as wholes instead of working row by row,
# and the result does not have one element per row.
df.lst %>% 
  mutate(xy.x = intersect(x.lst, y.lst))

#    Error in mutate_impl(.data, dots) : 
 #     Column `xy.x` must be length 2 (the number of rows) or one, not 0

我尝试使用Reduce(intersect...但没有成功。

我考虑过新建一个tibble，把列表列展平，但我更愿意保留每对记录占1行的结构，因为字符串相对较短。

感谢。

Answer 1

我们可以使用purrr包中的map2同时遍历两个列表，并对每一对元素应用intersect。结果可以存储在新列中（在这种情况下为Compare）。

library(tidyverse)

# Walk x.lst and y.lst in parallel and keep, for each row, the words present
# in both character vectors. map2() returns a list, so Compare is a list
# column with one character vector per row.
df.lst2 <- mutate(df.lst, Compare = map2(x.lst, y.lst, intersect))

# Show the shared words for each row.
df.lst2$Compare
# [[1]]
# [1] "three" "four" 
# 
# [[2]]
# [1] "four" "five"

更新：

我们也可以像另一个回答那样，将rowwise与mutate一起使用。但对于大型数据框，rowwise可能会降低intersect函数的性能。在这里，我使用microbenchmark包，在与df.lst结构相同的大型数据框（10000行）上评估这些方法。

library(microbenchmark)

# Create a large data frame: repeat the two example rows until there are
# 10,000 rows, keeping the same x / y columns as df. The repetition is spelled
# out with rep() instead of relying on data.frame() recycling against a
# throwaway ID column, and the result is built directly as a tibble (the
# as.tibble() spelling is deprecated in favour of as_tibble()).
n_rows <- 10000L
df_large <- tibble(
  x = rep(df$x, length.out = n_rows),
  y = rep(df$y, length.out = n_rows)
)

# Tokenise both text columns into list columns, exactly as was done for
# df.lst; glimpse() prints the structure and passes the tibble through.
df_large.lst <- df_large %>%
  mutate(
    x.lst = str_extract_all(x, "[:alnum:]+"),
    y.lst = str_extract_all(y, "[:alnum:]+")
  ) %>%
  glimpse()
# Observations: 10,000
# Variables: 4
# $ x     <chr> "one two three four", "two three four five", "one two three four", "two three...
# $ y     <chr> "three four five six", "four five six seven", "three four five six", "four fi...
# $ x.lst <list> [<"one", "two", "three", "four">, <"two", "three", "four", "five">, <"one", ...
# $ y.lst <list> [<"three", "four", "five", "six">, <"four", "five", "six", "seven">, <"three...

# Performance Evaluation
# Performance evaluation: time three ways of building the xy.x list column on
# df_large.lst, evaluating each expression 100 times.
perm <- microbenchmark(
  # m1: iterate over both list columns in parallel with map2().
  m1 = {
    df_large.lst2 <- df_large.lst %>%
      mutate(xy.x = map2(x.lst, y.lst, ~ intersect(.x, .y)))
  },
  # m2: make every row its own group with rowwise(), then drop the grouping.
  m2 = {
    df_large.lst2 <- df_large.lst %>%
      rowwise() %>%
      mutate(xy.x = list(intersect(x.lst, y.lst))) %>%
      ungroup()
  },
  # m3: add a rowname column and group on it; unlike m2 the result is left
  # grouped.
  m3 = {
    df_large.lst2 <- df_large.lst %>%
      rownames_to_column() %>%
      group_by(rowname) %>%
      mutate(xy.x = list(intersect(unlist(x.lst), unlist(y.lst))))
  },
  times = 100L
)

perm
# Unit: milliseconds
# expr      min       lq     mean   median       uq      max neval
#   m1 158.8871 171.7935 183.0220 176.3373 191.0863 260.3079   100
#   m2 353.1279 387.1014 405.2522 401.6800 422.6556 459.7453   100
#   m3 436.0175 465.9106 496.4585 481.7983 527.7079 613.0461   100

Answer 2

# Process the tibble one row at a time so that intersect() receives a single
# row's words from x.lst and y.lst; list() wraps the result so it can be
# stored as one cell of the xy.x list column.
s <- df.lst %>%
  rowwise() %>%
  mutate(xy.x = list(intersect(x.lst, y.lst)))

s$xy.x
[[1]]
[1] "three" "four" 

[[2]]
[1] "four" "five"

您也可以使用group_by

# Group by the two original text columns instead of going row by row. The
# result is stored in s1 so that s1$mm below refers to an existing object (the
# snippet previously never assigned it), and group_by() replaces the
# deprecated group_by_(). x.lst and y.lst are derived from x and y, so grouping
# on x and y alone separates the same rows. Rows sharing the same x and y fall
# into one group, which is why unlist() flattens the list columns before
# intersect() is applied.
s1 <- df.lst %>%
  group_by(x, y) %>%
  mutate(mm = list(intersect(unlist(x.lst), unlist(y.lst)))) %>%
  ungroup()

s1$mm
[[1]]
[1] "three" "four" 

[[2]]
[1] "four" "five"

如果您担心数据中可能存在两行内容非常相似（甚至相同）的情况，请改用以下做法：

 # Give every row a unique key (its row name) and group on that key, so each
 # group holds exactly one row even when two rows have identical text; the
 # per-row list cells are flattened with unlist() before intersecting.
 df.lst %>%
   rownames_to_column() %>%
   group_by(rowname) %>%
   mutate(mm = list(intersect(unlist(x.lst), unlist(y.lst))))


现在，如果对最后一种方法运行microbenchmark，并与另外两种方法进行比较：

perm
Unit: milliseconds
 expr        min         lq       mean     median         uq       max   neval
   m1 333.607625 354.065554 425.308486 374.658775  514.01087  818.6467    100
   m2 810.377360 842.860575 970.846458 878.892835 1074.33373 1329.3056    100
   m3   3.179928   3.323983   4.241713   3.799968    4.49567   20.0653    100

因此，在大型数据集上运行microbench之后，您将看到group_by更快：

转换列表中的列表列，在这种情况下使用"intersect"

2 个答案: