Question

我以前几乎从未使用过data.table，但是从如此多的线程看来，这似乎是大数据帧的解决方案。

在新列中，我想根据按组按组 （！）中的任何字符串的存在，以给定的顺序分配值：

首先寻找字符串1，然后寻找字符串2，然后寻找字符串3，依此类推...并根据结果分配值。

使用dplyr::case_when查看所需的输出（我仅显示每组的前6行）。它适用于较小的数据帧，但是我的代码冻结在具有数千个组的2.5Mio行数据帧上。

我很惊讶没有找到任何线索，我感到我的问题一定是虚假的。

样本数据

library(data.table) set.seed(1) mydat <- data.table(group = rep(letters[1:26], each = 3), value = sample(c('find','this','string'), 78, replace = TRUE))

所需结果

library(dplyr) des_dat <- mydat %>% group_by(group) %>% mutate(found = case_when(any(value == 'string') ~ 'A', any(value == 'this') ~ 'B', TRUE ~ 'C')) %>% ungroup des_dat %>% group_by(found) %>% slice(1:6) #just to show the first 6 results by group #> # A tibble: 18 x 3 #> # Groups: found [3] #> group value found #> <chr> <chr> <chr> #> 1 a find A #> 2 a string A #> 3 a find A #> 4 b this A #> 5 b find A #> 6 b string A #> 7 e find B #> 8 e find B #> 9 e this B #> 10 f this B #> 11 f this B #> 12 f this B #> 13 h find C #> 14 h find C #> 15 h find C #> 16 x find C #> 17 x find C #> 18 x find C

^{由reprex package（v0.3.0）于2019-11-08创建}

Answer 1

这应该相当快，因为setorder使用基数排序，然后使用二进制搜索来找到字符串，然后通过引用进行更新：

library(data.table)
set.seed(1)
nr <- 2.5e6
ng <- 1e5
mydat <- data.table(group = sample(ng, nr, TRUE), 
    value = sample(c('find','this','string'), nr, TRUE))

system.time({

    setkey(mydat, value)[, found := "C"]

    mydat[group %in% mydat[.("this"), unique(group)], found := "B"][
        group %in% mydat[.("string"), unique(group)], found := "A"]
})
#   user  system elapsed 
#   0.86    0.03    0.52

如果您的小组属于角色类别，请使用%chin%代替%in%。

编辑：实际上，一种更快的方法是使用data.table::fifelse（从1.12.4版开始可用）

DT1[, found := fifelse(any(value=="string"), "A", 
        fifelse(any(value=="this"), "B", "C")), group]

计时代码：

library(data.table)
set.seed(1)
nr <- 2.5e6
ng <- 1e5
mydat <- data.table(group = sample(ng, nr, TRUE), 
    value = sample(c('find','this','string'), nr, TRUE))
DT0 <- copy(mydat)
DT1 <- copy(mydat)

mtd0 <- function() {
    setkey(DT0, value)[, found := "C"]
    DT0[group %in% DT0[.("this"), unique(group)], found := "B"][
        group %in% DT0[.("string"), unique(group)], found := "A"]
}

mtd1 <- function() {
   DT1[, found := fifelse(any(value=="string"), "A", 
            fifelse(any(value=="this"), "B", "C")), group] 
}

bench::mark(mtd0(), mtd1(), check=FALSE)
identical(setorder(mtd0(), group, value), setorder(mtd1(), group, value))
[1] TRUE

ng <- 1e5的时间：

# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result     memory   time  gc     
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>     <list>   <lis> <list> 
1 mtd0()        1.54s    1.54s     0.648   329.8MB     2.59     1     4      1.54s <df[,3] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1()      361.1ms 362.72ms     2.76     33.5MB     0        2     0   725.44ms <df[,3] [~ <df[,3]~ <bch~ <tibbl~

ng <- 1e2的时间：

# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result     memory   time  gc     
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>     <list>   <lis> <list> 
1 mtd0()        1.51s    1.51s     0.662   321.6MB     2.65     1     4      1.51s <df[,3] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1()     120.89ms 466.93ms     2.14     48.1MB     1.07     2     1   933.86ms <df[,3] [~ <df[,3]~ <bch~ <tibbl~

ng <- 2.5e6的时间：

# A tibble: 2 x 13
  expression    min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result      memory     time  gc     
  <bch:expr> <bch:> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>      <list>     <lis> <list> 
1 mtd0()      2.78s   2.78s     0.360   282.2MB    1.08      1     3      2.78s <df[,3] [2~ <df[,3] [~ <bch~ <tibbl~
2 mtd1()      5.12s   5.12s     0.195    64.8MB    0.782     1     4      5.12s <df[,3] [2~ <df[,3] [~ <bch~ <tibbl

Answer 2

我想知道，先使用data.table然后再将merge转换为原始数据来计算汇总表会更有效吗？

# make helper function
fun1 <- function(x) ifelse(max(x == 'string'), 'A', ifelse(max(x == 'this'), 'B', 'C'))
mydat_summary <- mydat[, (.found = fun1(value)), group]
newdat <- merge(mydat, mydat_summary)

Answer 3

您可以使用match和max为“找到”值的向量建立索引：

mydat[ , found := LETTERS[3:1][max(match(value, c("find", "this", "string")))],
       by = group]

all.equal(des_dat$found, mydat$found)
# TRUE

在@ chinsoon12的答案中使用较大的“ mydat”时，速度与其替代方法相似。

还请注意，fcase / case_when function for data.table正在进行中。

dplyr :: case_when当按组在同一列上

3 个答案: