R data.table: what is the fastest way to intersect a data.table by key and by group over multiple columns?

Date: 2018-05-30 00:15:28

Tags: r merge data.table

MAJOR EDIT to clarify that the answers given below are wrong

I have a data.table with group columns (split_by), key columns (key_by) and trait id columns (intersect_by).

Within each split_by group, I want to keep only the rows whose trait ids are shared by all the keys present in that group.

For example:

library(data.table)
dt <- data.table(id = 1:6, key1 = 1, key2 = c(1:2, 2), group_id1 = 1, group_id2 = c(1:2, 2:1, 1:2), trait_id1 = 1, trait_id2 = 2:1)
setkey(dt, group_id1, group_id2, trait_id1, trait_id2)
dt
   id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1:  4    1    1         1         1         1         1
2:  1    1    1         1         1         1         2
3:  5    1    2         1         1         1         2
4:  2    1    2         1         2         1         1
5:  6    1    2         1         2         1         1
6:  3    1    2         1         2         1         2

res <- intersect_this_by(dt,
                         key_by = c("key1"),
                         split_by = c("group_id1", "group_id2"),
                         intersect_by = c("trait_id1", "trait_id2"))

I want res to look like this:

> res[]
   id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1:  1    1    1         1         1         1         2
2:  5    1    2         1         1         1         2
3:  2    1    2         1         2         1         1
4:  6    1    2         1         2         1         1
5:  3    1    2         1         2         1         2

We see that id 4 has been dropped: in the group defined by group_id1 = 1 and group_id2 = 1 (the group id 4 belongs to), only one key combination, (1,1), has the traits (1,1), while there are two key combinations in this group: (1,1) and (1,2). So the traits (1,1) are not shared by all the keys of the group, we drop them from this group, and hence drop id 4. Conversely, ids 1 and 5 have the same traits but different keys, and together these keys represent all the keys of the group ((1,1) and (1,2)), so the traits of ids 1 and 5 are kept.
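To make this logic concrete, here is a small sketch (for illustration only, not a fast solution) that follows the textual explanation above, treating the (key1, key2) pair as the key:

chk <- copy(dt)
# number of distinct (key1, key2) pairs present in each group
chk[, n_keys := uniqueN(paste(key1, key2)), by = .(group_id1, group_id2)]
# a trait combination is kept when it covers all of the group's keys
chk[, keep := uniqueN(paste(key1, key2)) == n_keys,
    by = .(group_id1, group_id2, trait_id1, trait_id2)]
chk[keep == TRUE]  # id 4 is the only row dropped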

A function that achieves this:

intersect_this_by2 <- function(dt,
                               key_by = NULL,
                               split_by = NULL,
                               intersect_by = NULL){

    dtc <- as.data.table(dt)       

    # compute number of keys in the group
    dtc[, n_keys := uniqueN(.SD), by = split_by, .SDcols = key_by]
    # compute number of keys represented by each trait in each group 
    # and keep row only if they represent all keys from the group
    dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by, split_by), .SDcols = key_by]
    dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
    return(dtc)      
}

But it becomes very slow for large datasets or complex traits/keys/groups... The real data.table has 10 million rows and the traits have 30 levels... Is there a way to improve it? Are there any obvious pitfalls? Thanks for the help!

FINAL EDIT: Uwe proposed a concise solution which is 40% faster than my initial code (which I deleted here because it was confusing). The final function looks like this:

intersect_this_by_uwe <- function(dt,
                                  key_by = c("key1"),
                                  split_by = c("group_id1", "group_id2"),
                                  intersect_by = c("trait_id1", "trait_id2")){
    dti <- copy(dt)
    dti[, original_order_id__ := 1:.N]
    setkeyv(dti, c(split_by, intersect_by, key_by))
    uni <- unique(dti, by = c(split_by, intersect_by, key_by))
    unique_keys_by_group <-
        unique(uni, by = c(split_by, key_by))[, .N, by = c(split_by)]
    unique_keys_by_group_and_trait <-
        uni[, .N, by = c(split_by, intersect_by)]
    # 1st join to pick group/traits combinations with equal number of unique keys
    selected_groups_and_traits <-
        unique_keys_by_group_and_trait[unique_keys_by_group,
                                       on = c(split_by, "N"), nomatch = 0L]
    # 2nd join to pick records of valid subsets
    dti[selected_groups_and_traits, on = c(split_by, intersect_by)][
        order(original_order_id__), -c("original_order_id__","N")]
}
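
As a quick sanity check on the small example above (a sketch; note that reproducing the expected res requires passing both key columns as the key, matching the textual explanation rather than the key_by = c("key1") call shown earlier):

res_uwe <- intersect_this_by_uwe(dt, key_by = c("key1", "key2"))
res_uwe  # ids 1, 5, 2, 6, 3 remain; id 4 is dropped as in res above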

For the record, benchmarks on the 10M rows dataset:

> microbenchmark::microbenchmark(old_way = {res <- intersect_this_by(dt,
+                                                                    key_by = c("key1"),
+                                                                    split_by = c("group_id1", "group_id2"),
+                                                                    intersect_by = c("trait_id1", "trait_id2"))},
+                                new_way = {res <- intersect_this_by2(dt,
+                                                                     key_by = c("key1"),
+                                                                     split_by = c("group_id1", "group_id2"),
+                                                                     intersect_by = c("trait_id1", "trait_id2"))},
+                                new_way_uwe = {res <- intersect_this_by_uwe(dt,
+                                                                            key_by = c("key1"),
+                                                                            split_by = c("group_id1", "group_id2"),
+                                                                            intersect_by = c("trait_id1", "trait_id2"))},
+                                times = 10)
Unit: seconds
        expr       min        lq      mean    median        uq       max neval cld
     old_way  3.145468  3.530898  3.514020  3.544661  3.577814  3.623707    10  b 
     new_way 15.670487 15.792249 15.948385 15.988003 16.097436 16.206044    10   c
 new_way_uwe  1.982503  2.350001  2.320591  2.394206  2.412751  2.436381    10 a  

4 Answers:

Answer 0 (score: 2)

Edit

While the answer below does reproduce the expected result for the small sample dataset, it fails to provide the correct answer for the large 10 M rows dataset provided by the OP.

However, I have decided to keep this wrong answer because of the benchmark results, which show the poor performance of the uniqueN() function. In addition, the answer contains benchmarks of much faster, alternative solutions.

If I understand correctly, the OP only wants to keep those rows where the unique combination of group_id1, group_id2, trait_id1, and trait_id2 appears in more than one row with distinct key1.

This can be achieved by counting the unique values of key1 within each group of group_id1, group_id2, trait_id1, and trait_id2, and selecting only those combinations of group_id1, group_id2, trait_id1, and trait_id2 where the count is greater than one. Finally, the matching rows are retrieved by a join:

library(data.table)
sel <- dt[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]
sel
   group_id1 group_id2 trait_id1 trait_id2 V1
1:         1         2         3         1  2
2:         2         2         2         1  2
3:         2         1         1         2  2
4:         1         1         1         1  2
5:         1         1         2         2  2
6:         2         2         2         2  2
7:         1         1         1         2  2
8:         1         1         3         2  2
res <- dt[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][order(id), -"V1"]
res
    id key1 group_id1 trait_id1 group_id2 trait_id2 extra
 1:  1    2         1         3         2         1     u
 2:  2    1         2         2         2         1     g
 3:  5    2         2         1         1         2     g
 4:  8    2         1         3         2         1     o
 5:  9    2         1         1         1         1     d
 6: 10    2         2         1         1         2     g
 7: 13    1         2         1         1         2     c
 8: 14    2         1         2         1         2     t
 9: 15    1         1         3         2         1     y
10: 16    2         1         3         2         1     v
11: 19    2         2         2         2         2     y
12: 22    2         2         2         2         1     g
13: 24    2         1         1         1         2     i
14: 25    1         1         3         1         2     n
15: 26    1         2         2         2         2     y
16: 27    1         1         1         1         1     n
17: 28    1         1         1         1         2     h
18: 29    1         2         2         2         2     b
19: 30    2         1         3         1         2     k
20: 31    1         2         2         2         2     w
21: 35    1         1         2         1         2     q
22: 37    2         2         1         1         2     r
23: 39    1         1         1         1         2     o
    id key1 group_id1 trait_id1 group_id2 trait_id2 extra

This does reproduce the OP's expected result, but is it also the fastest way, as the OP requested?

Benchmarking, part 1

The OP's code is used to create the benchmark data (but with 1 M rows instead of 10 M rows):

set.seed(0)
n <- 1e6
p <- 1e5
m <- 5
dt <- data.table(id = 1:n,
                 key1 = sample(1:m, size = n, replace = TRUE),
                 group_id1 = sample(1:2, size = n, replace = TRUE),
                 trait_id1 = sample(1:p, size = n, replace = TRUE),
                 group_id2 = sample(1:2, size = n, replace = TRUE),
                 trait_id2 = sample(1:2, size = n, replace = TRUE),
                 extra = sample(letters, n, replace = TRUE))

I was surprised to find that the solution using uniqueN() is not the fastest one:

Unit: milliseconds
    expr       min        lq      mean    median        uq       max neval cld
 old_way  489.4606  496.3801  523.3361  503.2997  540.2739  577.2482     3 a  
 new_way 9356.4131 9444.5698 9567.4035 9532.7265 9672.8987 9813.0710     3   c
    uwe1 5946.4533 5996.7388 6016.8266 6047.0243 6052.0133 6057.0023     3  b

Benchmark code:

microbenchmark::microbenchmark(
  old_way = {
    DT <- copy(dt)
    res <- intersect_this_by(DT,
                             key_by = c("key1"),
                             split_by = c("group_id1", "group_id2"),
                             intersect_by = c("trait_id1", "trait_id2"))
  },
  new_way = {
    DT <- copy(dt)
    res <- intersect_this_by2(DT,
                              key_by = c("key1"),
                              split_by = c("group_id1", "group_id2"),
                              intersect_by = c("trait_id1", "trait_id2"))
  },
  uwe1 = {
    DT <- copy(dt)
    sel <- DT[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  times = 3L)

Note that for each run a fresh copy of the benchmark data is used, in order to avoid side effects from previous runs, e.g., indices set by data.table.
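A quick way to observe such a side effect (a sketch; indices() is part of data.table):

indices(DT)          # NULL on a fresh copy
DT[trait_id1 == 1L]  # may set an auto-index on trait_id1 as a side effect
indices(DT)          # possibly "trait_id1" now, which would speed up later runs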

Switching verbose mode on with
options(datatable.verbose = TRUE)

shows that most of the time is spent computing uniqueN() over all the groups:

sel <- DT[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]

Detected that j uses these columns: key1 
Finding groups using forderv ... 0.060sec 
Finding group sizes from the positions (can be avoided to save RAM) ... 0.000sec 
Getting back original order ... 0.050sec 
lapply optimization is on, j unchanged as 'uniqueN(key1)'
GForce is on, left j unchanged
Old mean optimization is on, left j unchanged.
Making each group and running j (GForce FALSE) ... 
  collecting discontiguous groups took 0.084s for 570942 groups
  eval(j) took 5.505s for 570942 calls
5.940sec

This is a known issue. However, the alternative length(unique()) (of which uniqueN() is an abbreviation) only brings a modest speedup of about a factor of 2.
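
For reference, the length(unique()) variant would look like this (a sketch; as noted, it brings only a modest speedup):

sel <- DT[, length(unique(key1)),
          by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]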

So I started looking for ways to avoid uniqueN() and length(unique()).

Benchmarking, part 2

I found two alternatives that are fast enough. Both create, in a first step, a data.table of the unique combinations of group_id1, group_id2, trait_id1, trait_id2, and key1, then count the number of distinct key1 values for each group of group_id1, group_id2, trait_id1, and trait_id2, and filter for counts greater than one:

sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
  , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]

sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
  , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
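
In both variants, sel is then joined back to pick the matching rows, exactly as before (the same join appears in the benchmark code below):

res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][order(id)]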

The verbose output shows that the compute times for these variants are significantly better.

For benchmarking, only the fastest methods are used, but now with 10 M rows. In addition, each variant is tried with setkey() and setorder(), respectively, applied beforehand:

microbenchmark::microbenchmark(
  old_way = {
    DT <- copy(dt)
    res <- intersect_this_by(DT,
                             key_by = c("key1"),
                             split_by = c("group_id1", "group_id2"),
                             intersect_by = c("trait_id1", "trait_id2"))
  },
  uwe3 = {
    DT <- copy(dt)
    sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  uwe3k = {
    DT <- copy(dt)
    setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  uwe3o = {
    DT <- copy(dt)
    setorder(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  uwe4 = {
    DT <- copy(dt)
    sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  uwe4k = {
    DT <- copy(dt)
    setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  uwe4o = {
    DT <- copy(dt)
    setorder(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
      , .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
    res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id)]
  },
  times = 3L)

The benchmark results for the 10 M case show that both variants are faster than the OP's intersect_this_by() function, and that keying and ordering drive the speedup (with a minimal advantage for ordering).

Unit: seconds
    expr      min       lq     mean   median       uq      max neval  cld
 old_way 7.173517 7.198064 7.256211 7.222612 7.297559 7.372506     3    d
    uwe3 6.820324 6.833151 6.878777 6.845978 6.908003 6.970029     3   c 
   uwe3k 5.349949 5.412018 5.436806 5.474086 5.480234 5.486381     3 a   
   uwe3o 5.423440 5.432562 5.467376 5.441683 5.489344 5.537006     3 a   
    uwe4 6.270724 6.276757 6.301774 6.282790 6.317299 6.351807     3  b  
   uwe4k 5.280763 5.295251 5.418803 5.309739 5.487823 5.665906     3 a   
   uwe4o 4.921627 5.095762 5.157010 5.269898 5.274702 5.279506     3 a

Answer 1 (score: 2)

With the additional explanations by the OP, I believe I have gained a better understanding of the problem.

The OP wants to remove incomplete subsets from his dataset. Each group of group_id1 and group_id2 contains a set of unique key1 values. A complete subset contains at least one group_id1, group_id2, trait_id1, trait_id2, key1 record for each key1 value in the group_id1, group_id2 group.

It is not necessary to compare the key1 values themselves when checking the group_id1, group_id2, trait_id1, trait_id2 level against the group_id1, group_id2 level; it is sufficient to check whether the numbers of distinct key1 values are equal.

So the solution below follows the outline of the OP's own answer but uses two joins to achieve the result:

setkey(dt, group_id1, group_id2, trait_id1, trait_id2, key1)
uni <- unique(dt, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))
unique_keys_by_group <- 
  unique(uni, by = c("group_id1", "group_id2", "key1"))[, .N, by = .(group_id1, group_id2)]
unique_keys_by_group_and_trait <- 
  uni[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)]
# 1st join to pick group/traits combinations with equal number of unique keys
selected_groups_and_traits <- 
  unique_keys_by_group_and_trait[unique_keys_by_group, 
                                 on = .(group_id1, group_id2, N), nomatch = 0L]
# 2nd join to pick records of valid subsets
res <- dt[selected_groups_and_traits, on = .(group_id1, group_id2, trait_id1, trait_id2)][
  order(id), -"N"]

It can be verified that the result is identical to the OP's result:

identical(
  intersect_this_by(dt,
                    key_by = c("key1"),
                    split_by = c("group_id1", "group_id2"),
                    intersect_by = c("trait_id1", "trait_id2")),
  res)
[1] TRUE

Note that uniqueN() is not used, due to the performance issues shown by the benchmarks of my first (wrong) answer.

Benchmark comparison

The OP's benchmark data (10 M rows) is used.

The solution presented here is 40% faster:

library(microbenchmark)
mb <- microbenchmark(
  old_way = {
    DT <- copy(dt)
    res <- intersect_this_by(DT,
                             key_by = c("key1"),
                             split_by = c("group_id1", "group_id2"),
                             intersect_by = c("trait_id1", "trait_id2"))
  },
  uwe = {
    DT <- copy(dt)
    setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    uni <- 
      unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))
    unique_keys_by_group <- 
      unique(uni, by = c("group_id1", "group_id2", "key1"))[
        , .N, by = .(group_id1, group_id2)]
    unique_keys_by_group_and_trait <- 
      uni[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)]
    selected_groups_and_traits <- 
      unique_keys_by_group_and_trait[unique_keys_by_group, 
                                     on = .(group_id1, group_id2, N), nomatch = 0L]
    res <- DT[selected_groups_and_traits, 
              on = .(group_id1, group_id2, trait_id1, trait_id2)][
      order(id), -"N"]
  },
  times = 3L)
mb

Unit: seconds
    expr      min       lq     mean   median       uq      max neval cld
 old_way 7.251277 7.315796 7.350636 7.380316 7.400315 7.420315     3   b
     uwe 4.379781 4.461368 4.546267 4.542955 4.629510 4.716065     3  a

Edit: Further performance improvements

The OP has asked for ideas to further improve the performance.

I have tried different approaches, including double nested grouping (using the slow uniqueN() just for a simpler display of the code):

res <- DT[, {
  nuk_g = uniqueN(key1)
  .SD[, if (nuk_g == uniqueN(key1)) .SD, by = .(trait_id1, trait_id2)]
}, by = .(group_id1, group_id2)][order(id)]

But all of them were slower for the given benchmark data.

The performance of a particular method likely depends not just on the problem size, i.e., the number of rows, but also on the problem structure, e.g., the number of distinct groups, traits, and keys, as well as on data types, etc.

So, without knowing the structure of your production data and the context of your computational flow, I do not think it is worthwhile to spend more time on benchmarking.

In any case, one suggestion: make sure that setkey() is called only once, as it is rather costly (about 2 seconds) but speeds up all subsequent operations. (Verify with options(datatable.verbose = TRUE).)
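
A minimal sketch of that suggestion, assuming the 10 M row benchmark data in DT:

options(datatable.verbose = TRUE)  # reports whether a sort step could be skipped
setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)  # pay the sort cost once
# subsequent unique()/grouping/join calls on these columns can reuse the key
# instead of reordering the data again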

Answer 2 (score: 1)

I will start with a tidyverse approach and then show the equivalent in data.table.

Please let me know if this result is intended, because it differs from your requested output; it is, however, what you described in the text.

1. Tidy approach

Simply create a single column out of the traits, then group by the grouping columns and the new combined trait, and filter for group frequencies greater than 1.

library(dplyr)

dt %>%
  mutate(comb = paste0(trait_id1, trait_id2)) %>%
  group_by(group_id1, group_id2, comb) %>%
  filter(n() > 1)

2. data.table approach

Roughly the same approach as the tidy one above, written in data.table.

The fast paste method was found using the answer from here.

dt[, comb := do.call(paste, c(.SD, sep = "")), .SDcols = c("trait_id1", "trait_id2")][, freq := .N, by = .(group_id1, group_id2, comb)][freq > 1]
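
Note that this one-liner adds the comb and freq helper columns to dt by reference. A side-effect-free sketch (the sep = "_" separator is my assumption, chosen so that traits like (1, 12) and (11, 2) cannot collide):

res <- copy(dt)[
  , comb := do.call(paste, c(.SD, sep = "_")), .SDcols = c("trait_id1", "trait_id2")][
  , freq := .N, by = .(group_id1, group_id2, comb)][
  freq > 1][
  , c("comb", "freq") := NULL][]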

Comparison

Comparing the two methods plus Chinsoon's comment, the speeds are:

microbenchmark::microbenchmark(zac_tidy = {
  dt %>%
    mutate(comb = paste0(trait_id1, trait_id2)) %>%
    group_by(group_id1, group_id2, comb) %>%
    filter(n() > 1)
},
zac_dt = {
  dt[, comb := do.call(paste, c(.SD, sep = "")), .SDcols = c("trait_id1", "trait_id2")][, freq := .N, by = .(group_id1, group_id2, comb)][freq > 1]
},
chin_dt = {
  dt[id %in% dt[, .SD[, if (.N > 1) id, by=.(trait_id1, trait_id2)], by=.(group_id1, group_id2)]$V1]
}, times = 100)
Unit: milliseconds
     expr      min       lq     mean   median       uq       max neval
 zac_tidy 4.151115 4.677328 6.150869 5.552710 7.765968  8.886388   100
   zac_dt 1.965013 2.201499 2.829999 2.640686 3.507516  3.831240   100
  chin_dt 4.567210 5.217439 6.972013 7.330628 8.233379 12.807005   100

> identical(zac_dt, chin_dt)
[1] TRUE
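
For readability, Chinsoon's one-liner can be unpacked like this (same logic, just spelled out):

# inner grouping: within each group, for each trait combination that
# occurs more than once, emit the ids of its rows
keep_ids <- dt[, .SD[, if (.N > 1) id, by = .(trait_id1, trait_id2)],
               by = .(group_id1, group_id2)]$V1
# then keep only those rows
res_chin <- dt[id %in% keep_ids]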

Comparison at 10 million rows

10 repetitions:

Unit: milliseconds
     expr       min        lq      mean    median       uq       max neval
 zac_tidy 12.492261 14.169898 15.658218 14.680287 16.31024 22.062874    10
   zac_dt 10.169312 10.967292 12.425121 11.402416 12.23311 21.036535    10
  chin_dt  6.381693  6.793939  8.449424  8.033886  9.78187 12.005604    10
 chin_dt2  5.536246  6.888020  7.914103  8.310142  8.74655  9.600121    10
I therefore recommend Chinsoon's method; either works, though.

Answer 3 (score: 1)

The other answers do not solve the problem, but I found a method inspired by them. The idea is to first count the number of keys present in the group and, for each trait combination, to keep only rows whose traits cover the full number of keys:

 intersect_this_by2 <- function(dt,
         key_by = NULL,
         split_by = NULL,
         intersect_by = NULL){

    if (is.null(intersect_by) |
        is.null(key_by) |
        !is.data.frame(dt) |
        nrow(dt) == 0) {
        return(dt)
    }
    data_table_input <- is.data.table(dt)
    dtc <- as.data.table(dt)

    if (!is.null(split_by)) {
        # compute number of keys in the group
        dtc[, n_keys := uniqueN(.SD), by = split_by, .SDcols = key_by]
        # compute number of keys represented by each trait in each group 
        # and keep row only if they represent all keys from the group
        dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by, split_by), .SDcols = key_by]
        dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
    } else {
        dtc[, n_keys := uniqueN(.SD), .SDcols = key_by]
        dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by), .SDcols = key_by]
        dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
    }
    if (!data_table_input) {
        return(as.data.frame(dtc))
    } else {
        return(dtc)
    }
}

The problem is that it is much slower on my real dataset (5 to 6 times slower), but I think this function helps to understand the problem better. Also, a dataset closer to my real one is defined below:

pacman::p_load(data.table, microbenchmark, testthat)

set.seed(0)
n <- 1e7
p <- 1e5
m <- 5
dt <- data.table(id = 1:n,
                 key1 = sample(1:m, size = n, replace = TRUE),
                 group_id1 = sample(1:2, size = n, replace = TRUE),
                 trait_id1 = sample(1:p, size = n, replace = TRUE),
                 group_id2 = sample(1:2, size = n, replace = TRUE),
                 trait_id2 = sample(1:2, size = n, replace = TRUE),
                 extra = sample(letters, n, replace = TRUE))
microbenchmark::microbenchmark(old_way = {res <- intersect_this_by(dt,
                                                                    key_by = c("key1"),
                                                                    split_by = c("group_id1", "group_id2"),
                                                                    intersect_by = c("trait_id1", "trait_id2"))},
                               new_way = {res <- intersect_this_by2(dt,
                                                                   key_by = c("key1"),
                                                                   split_by = c("group_id1", "group_id2"),
                                                                   intersect_by = c("trait_id1", "trait_id2"))},
                               times = 1)


Unit: seconds
    expr       min        lq      mean    median        uq       max neval
 old_way  5.891489  5.891489  5.891489  5.891489  5.891489  5.891489     1
 new_way 18.455860 18.455860 18.455860 18.455860 18.455860 18.455860     1

For information, the number of rows of res in this example is:

> set.seed(0)
> n <- 1e7
> p <- 1e5
> m <- 5
> dt <- data.table(id = 1:n,
                   key1 = sample(1:m, size = n, replace = TRUE),
                   group_id1 = sample(1:2, size = n, replace = TRUE),
                   trait_id1 = sample(1:p, size = n, replace = TRUE),
                   group_id2 = sample(1:2, size = n, replace = TRUE),
                   trait_id2 = sample(1:2, size = n, replace = TRUE),
                   extra = sample(letters, n, replace = TRUE))
> res <- intersect_this_by(dt,
                            key_by = c("key1"),
                            split_by = c("group_id1", "group_id2"),
                            intersect_by = c("trait_id1", "trait_id2"))
> nrow(res)
[1] 7099860
> res <- intersect_this_by2(dt,
                            key_by = c("key1"),
                            split_by = c("group_id1", "group_id2"),
                            intersect_by = c("trait_id1", "trait_id2"))
> nrow(res)
[1] 7099860