MAJOR EDIT: clarification (an answer below turned out to be wrong)
I have a data.table with group columns (split_by), key columns (key_by) and trait id columns (intersect_by).
I want, within each split_by group, to keep only the rows whose trait ids are shared by all the keys present in that group.
For example:
dt <- data.table(id = 1:6, key1 = 1, key2 = c(1:2, 2), group_id1 = 1, group_id2 = c(1:2, 2:1, 1:2), trait_id1 = 1, trait_id2 = 2:1)
setkey(dt, group_id1, group_id2, trait_id1, trait_id2)
dt
   id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1:  4    1    1         1         1         1         1
2:  1    1    1         1         1         1         2
3:  5    1    2         1         1         1         2
4:  2    1    2         1         2         1         1
5:  6    1    2         1         2         1         1
6:  3    1    2         1         2         1         2
res <- intersect_this_by(dt,
                         key_by = c("key1"),
                         split_by = c("group_id1", "group_id2"),
                         intersect_by = c("trait_id1", "trait_id2"))
I would like res to be like this:
> res[]
   id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1:  1    1    1         1         1         1         2
2:  5    1    2         1         1         1         2
3:  2    1    2         1         2         1         1
4:  6    1    2         1         2         1         1
5:  3    1    2         1         2         1         2
We see that id 4 has been dropped: in the group given by the combination group_id1 = 1 and group_id2 = 1 (the group id 4 belongs to), only one key combination, (1,1), has the traits (1,1), although there are two key combinations in this group: (1,1) and (1,2). The traits (1,1) are therefore not shared by all keys of the group, so we remove them from this group, and hence drop id 4. Conversely, ids 1 and 5 have the same traits but different keys, and those keys represent all the keys of the group ((1,1) and (1,2)), so the traits of ids 1 and 5 are kept.
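To make the logic concrete, here is a small sketch that lists, for the (group_id1, group_id2) = (1, 1) group of the toy dt above, which key combinations each trait combination covers (expected output shown as comments):

dt[group_id1 == 1 & group_id2 == 1,
   .(keys = paste(unique(paste(key1, key2, sep = ",")), collapse = " | ")),
   by = .(trait_id1, trait_id2)]
#    trait_id1 trait_id2      keys
# 1:         1         1       1,1   <- covers only one of the two keys -> dropped
# 2:         1         2 1,1 | 1,2   <- covers all keys of the group    -> kept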
A function which achieves this is given there:
intersect_this_by2 <- function(dt,
                               key_by = NULL,
                               split_by = NULL,
                               intersect_by = NULL){
  dtc <- as.data.table(dt)
  # compute number of keys in the group
  dtc[, n_keys := uniqueN(.SD), by = split_by, .SDcols = key_by]
  # compute number of keys represented by each trait in each group
  # and keep row only if they represent all keys from the group
  dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by, split_by), .SDcols = key_by]
  dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
  return(dtc)
}
But it gets very slow for big datasets or complicated traits/keys/groups... The real data.table has 10 million rows and the traits have 30 levels... Is there any way to improve it? Any obvious pitfalls? Thanks for your help!
FINAL EDIT: Uwe proposed a concise solution which is 40% faster than my initial code (which I removed here because it was confusing). The final function looks like this:
intersect_this_by_uwe <- function(dt,
                                  key_by = c("key1"),
                                  split_by = c("group_id1", "group_id2"),
                                  intersect_by = c("trait_id1", "trait_id2")){
  dti <- copy(dt)
  dti[, original_order_id__ := 1:.N]
  setkeyv(dti, c(split_by, intersect_by, key_by))
  uni <- unique(dti, by = c(split_by, intersect_by, key_by))
  unique_keys_by_group <-
    unique(uni, by = c(split_by, key_by))[, .N, by = c(split_by)]
  unique_keys_by_group_and_trait <-
    uni[, .N, by = c(split_by, intersect_by)]
  # 1st join to pick group/traits combinations with equal number of unique keys
  selected_groups_and_traits <-
    unique_keys_by_group_and_trait[unique_keys_by_group,
                                   on = c(split_by, "N"), nomatch = 0L]
  # 2nd join to pick records of valid subsets
  dti[selected_groups_and_traits, on = c(split_by, intersect_by)][
    order(original_order_id__), -c("original_order_id__","N")]
}
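A quick note and usage sketch: with the call shown earlier (key_by = c("key1")), every group in the toy data contains a single key1 value, so nothing is dropped; to actually reproduce the expected res above, both key columns arguably need to be passed as key_by:

res <- intersect_this_by_uwe(dt, key_by = c("key1", "key2"))
res[]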
For the record, the benchmarks on the 10M-row dataset:
> microbenchmark::microbenchmark(old_way = {res <- intersect_this_by(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ new_way = {res <- intersect_this_by2(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ new_way_uwe = {res <- intersect_this_by_uwe(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ times = 10)
Unit: seconds
expr min lq mean median uq max neval cld
old_way 3.145468 3.530898 3.514020 3.544661 3.577814 3.623707 10 b
new_way 15.670487 15.792249 15.948385 15.988003 16.097436 16.206044 10 c
new_way_uwe 1.982503 2.350001 2.320591 2.394206 2.412751 2.436381 10 a
Answer 0 (score: 2)
While the answer below does reproduce the expected result for the small sample dataset, it fails to give the correct answer for the large 10 M rows dataset provided by the OP.
However, I have decided to keep this wrong answer because the benchmark results show the poor performance of the uniqueN() function. In addition, the answer contains benchmarks of much faster, alternative solutions.
If I understand correctly, the OP wants to keep only those rows where the unique combination of group_id1, group_id2, trait_id1, and trait_id2 appears in more than one row with distinct key1.
This can be achieved by counting the unique values of key1 within each group of group_id1, group_id2, trait_id1, and trait_id2 and by selecting only those combinations of group_id1, group_id2, trait_id1, and trait_id2 where the count is greater than one. Finally, the matching rows are retrieved by joining:
library(data.table)
sel <- dt[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]
sel
   group_id1 group_id2 trait_id1 trait_id2 V1
1:         1         2         3         1  2
2:         2         2         2         1  2
3:         2         1         1         2  2
4:         1         1         1         1  2
5:         1         1         2         2  2
6:         2         2         2         2  2
7:         1         1         1         2  2
8:         1         1         3         2  2
res <- dt[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][order(id), -"V1"]
res
    id key1 group_id1 trait_id1 group_id2 trait_id2 extra
 1:  1    2         1         3         2         1     u
 2:  2    1         2         2         2         1     g
 3:  5    2         2         1         1         2     g
 4:  8    2         1         3         2         1     o
 5:  9    2         1         1         1         1     d
 6: 10    2         2         1         1         2     g
 7: 13    1         2         1         1         2     c
 8: 14    2         1         2         1         2     t
 9: 15    1         1         3         2         1     y
10: 16    2         1         3         2         1     v
11: 19    2         2         2         2         2     y
12: 22    2         2         2         2         1     g
13: 24    2         1         1         1         2     i
14: 25    1         1         3         1         2     n
15: 26    1         2         2         2         2     y
16: 27    1         1         1         1         1     n
17: 28    1         1         1         1         2     h
18: 29    1         2         2         2         2     b
19: 30    2         1         3         1         2     k
20: 31    1         2         2         2         2     w
21: 35    1         1         2         1         2     q
22: 37    2         2         1         1         2     r
23: 39    1         1         1         1         2     o
    id key1 group_id1 trait_id1 group_id2 trait_id2 extra
This reproduces the OP's expected result, but is it also the fastest way, as requested?
The OP's code is used to create the benchmark data (but with 1 M rows instead of 10 M rows):
set.seed(0)
n <- 1e6
p <- 1e5
m <- 5
dt <- data.table(id = 1:n,
                 key1 = sample(1:m, size = n, replace = TRUE),
                 group_id1 = sample(1:2, size = n, replace = TRUE),
                 trait_id1 = sample(1:p, size = n, replace = TRUE),
                 group_id2 = sample(1:2, size = n, replace = TRUE),
                 trait_id2 = sample(1:2, size = n, replace = TRUE),
                 extra = sample(letters, n, replace = TRUE))
I was surprised to find that the solution using uniqueN() is not the fastest one:
Unit: milliseconds
    expr       min        lq      mean    median        uq       max neval cld
 old_way  489.4606  496.3801  523.3361  503.2997  540.2739  577.2482     3 a
 new_way 9356.4131 9444.5698 9567.4035 9532.7265 9672.8987 9813.0710     3   c
    uwe1 5946.4533 5996.7388 6016.8266 6047.0243 6052.0133 6057.0023     3  b
The benchmark code:
microbenchmark::microbenchmark(
old_way = {
DT <- copy(dt)
res <- intersect_this_by(DT,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
},
new_way = {
DT <- copy(dt)
res <- intersect_this_by2(DT,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
},
uwe1 = {
DT <- copy(dt)
sel <- DT[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
times = 3L)
Note that a fresh copy of the benchmark data is used for each run in order to avoid any side effects from previous runs, e.g., indices set by data.table.
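As a side note, a small sketch of how such leftover indices can be spotted: data.table's indices() helper lists the secondary indices attached to a table.

indices(DT)   # NULL on a fresh copy; non-NULL if an earlier join auto-created an index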
Switching verbose mode on with options(datatable.verbose = TRUE) reveals that most of the time is spent computing uniqueN() for all the groups:
sel <- DT[, uniqueN(key1), by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]
Detected that j uses these columns: key1
Finding groups using forderv ... 0.060sec
Finding group sizes from the positions (can be avoided to save RAM) ... 0.000sec
Getting back original order ... 0.050sec
lapply optimization is on, j unchanged as 'uniqueN(key1)'
GForce is on, left j unchanged
Old mean optimization is on, left j unchanged.
Making each group and running j (GForce FALSE) ...
  collecting discontiguous groups took 0.084s for 570942 groups
  eval(j) took 5.505s for 570942 calls
5.940sec
This is a known issue. However, the alternative length(unique()) (for which uniqueN() is an abbreviation) only brings a moderate speedup of about 2x.
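For reference, a sketch of that length(unique()) variant, identical to the uniqueN() call above except for the aggregation function:

sel <- DT[, length(unique(key1)),
          by = .(group_id1, group_id2, trait_id1, trait_id2)][V1 > 1]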
So I started looking for ways to avoid uniqueN() or length(unique()) altogether.
I have found two alternatives which are sufficiently fast. Both create a data.table of the unique combinations of group_id1, group_id2, trait_id1, trait_id2, and key1 in a first step, count the number of distinct key1 values for each group of group_id1, group_id2, trait_id1, and trait_id2 in a second step, and filter for counts greater than one:
sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
and
sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
The verbose output shows that the computing times for these variants are considerably better.
For benchmarking, only the fastest methods are used, but this time with 10 M rows. In addition, each variant is tried with setkey() and setorder(), respectively, applied beforehand:
microbenchmark::microbenchmark(
old_way = {
DT <- copy(dt)
res <- intersect_this_by(DT,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
},
uwe3 = {
DT <- copy(dt)
sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
uwe3k = {
DT <- copy(dt)
setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
uwe3o = {
DT <- copy(dt)
setorder(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
sel <- DT[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2, key1)][
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
uwe4 = {
DT <- copy(dt)
sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
uwe4k = {
DT <- copy(dt)
setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
uwe4o = {
DT <- copy(dt)
setorder(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
sel <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))[
, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)][N > 1]
res <- DT[sel, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id)]
},
times = 3L)
The benchmark results for the 10 M case show that both variants are faster than the OP's intersect_this_by() function and that keying and ordering drive the speedup (with a minimal advantage for ordering).
Unit: seconds
    expr      min       lq     mean   median       uq      max neval cld
 old_way 7.173517 7.198064 7.256211 7.222612 7.297559 7.372506     3   d
    uwe3 6.820324 6.833151 6.878777 6.845978 6.908003 6.970029     3  c
   uwe3k 5.349949 5.412018 5.436806 5.474086 5.480234 5.486381     3 a
   uwe3o 5.423440 5.432562 5.467376 5.441683 5.489344 5.537006     3 a
    uwe4 6.270724 6.276757 6.301774 6.282790 6.317299 6.351807     3  b
   uwe4k 5.280763 5.295251 5.418803 5.309739 5.487823 5.665906     3 a
   uwe4o 4.921627 5.095762 5.157010 5.269898 5.274702 5.279506     3 a
Answer 1 (score: 2)
With the additional explanations by the OP, I believe I have gained a better understanding of the problem.
The OP wants to remove incomplete subsets from his dataset. Each (group_id1, group_id2) group contains a set of unique key1 values. A complete subset contains at least one group_id1, group_id2, trait_id1, trait_id2, key1 record for each key1 value of the (group_id1, group_id2) group.
It is not necessary to check the key1 values themselves when comparing the grouping on the group_id1, group_id2, trait_id1, trait_id2 level with the group_id1, group_id2 level; it is sufficient to check whether the number of distinct key1 values is equal.
So, the solution below follows the outline of the OP's own answer but uses two joins to achieve the result:
setkey(dt, group_id1, group_id2, trait_id1, trait_id2, key1)
uni <- unique(dt, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))
unique_keys_by_group <-
unique(uni, by = c("group_id1", "group_id2", "key1"))[, .N, by = .(group_id1, group_id2)]
unique_keys_by_group_and_trait <-
uni[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)]
# 1st join to pick group/traits combinations with equal number of unique keys
selected_groups_and_traits <-
unique_keys_by_group_and_trait[unique_keys_by_group,
on = .(group_id1, group_id2, N), nomatch = 0L]
# 2nd join to pick records of valid subsets
res <- dt[selected_groups_and_traits, on = .(group_id1, group_id2, trait_id1, trait_id2)][
order(id), -"N"]
It can be verified that the result is identical to the OP's result:

identical(
    intersect_this_by(dt,
                      key_by = c("key1"),
                      split_by = c("group_id1", "group_id2"),
                      intersect_by = c("trait_id1", "trait_id2")),
    res)
[1] TRUE
Note that the uniqueN() function is not used, due to the performance issues revealed by the benchmarks of my first (wrong) answer.
Using the OP's benchmark data (10 M rows), the solution presented here is 40% faster:
library(microbenchmark)
mb <- microbenchmark(
  old_way = {
    DT <- copy(dt)
    res <- intersect_this_by(DT,
                             key_by = c("key1"),
                             split_by = c("group_id1", "group_id2"),
                             intersect_by = c("trait_id1", "trait_id2"))
  },
  uwe = {
    DT <- copy(dt)
    setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)
    uni <- unique(DT, by = c("group_id1", "group_id2", "trait_id1", "trait_id2", "key1"))
    unique_keys_by_group <-
      unique(uni, by = c("group_id1", "group_id2", "key1"))[
        , .N, by = .(group_id1, group_id2)]
    unique_keys_by_group_and_trait <-
      uni[, .N, by = .(group_id1, group_id2, trait_id1, trait_id2)]
    selected_groups_and_traits <-
      unique_keys_by_group_and_trait[unique_keys_by_group,
                                     on = .(group_id1, group_id2, N), nomatch = 0L]
    res <- DT[selected_groups_and_traits,
              on = .(group_id1, group_id2, trait_id1, trait_id2)][
                order(id), -"N"]
  },
  times = 3L)
mb
Unit: seconds
    expr      min       lq     mean   median       uq      max neval cld
 old_way 7.251277 7.315796 7.350636 7.380316 7.400315 7.420315     3   b
     uwe 4.379781 4.461368 4.546267 4.542955 4.629510 4.716065     3  a

The OP has asked for ideas to improve performance further.
I have tried different approaches, including a double nested grouping (using the slow uniqueN() just for the sake of simpler code display):

res <- DT[, {
    nuk_g = uniqueN(key1)
    .SD[, if (nuk_g == uniqueN(key1)) .SD, by = .(trait_id1, trait_id2)]
}, by = .(group_id1, group_id2)][order(id)]
However, all of them are slower for the given benchmark data.
The performance of a particular method likely depends not only on the problem size, i.e., the number of rows, but also on the problem structure, e.g., the number of distinct groups, traits, and keys, as well as on the data types, etc.
So, without knowing the structure of your production data and the context of your computational flow, I don't think it's worthwhile to spend more time on benchmarking.
Anyway, one suggestion: make sure that setkey() is called only once, as it is rather costly (about 2 seconds) but speeds up all subsequent operations. (Verify with options(datatable.verbose = TRUE).)
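A minimal sketch of that suggestion, assuming the 10 M row DT from the benchmarks above:

options(datatable.verbose = TRUE)                              # report key usage
setkey(DT, group_id1, group_id2, trait_id1, trait_id2, key1)   # pay the ~2 s cost once
# subsequent grouping and join operations on these columns should now report
# that the existing key is reused instead of re-sorting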
Answer 2 (score: 1)
I'm going to start with a tidyverse approach and then show the equivalent in data.table.
Let me know whether this result is intended, because it differs from the output you asked for; it is, however, what you described in the text.
Simply create a single column out of the traits, then group by the grouping columns and the new combined trait, and filter for a group frequency greater than 1.
library(dplyr)

dt %>%
  mutate(comb = paste0(trait_id1, trait_id2)) %>%
  group_by(group_id1, group_id2, comb) %>%
  filter(n() > 1)
Much the same method as the previous tidy approach, now written in data.table, using the answer from here for a fast paste method.
dt[, comb := do.call(paste, c(.SD, sep = "")), .SDcols = c("trait_id1", "trait_id2")][, freq := .N, by = .(group_id1, group_id2, comb)][freq > 1]
Comparing the two methods, plus Chinsoon's suggestion from the comments, the speeds are:
microbenchmark::microbenchmark(zac_tidy = {
  dt %>%
    mutate(comb = paste0(trait_id1, trait_id2)) %>%
    group_by(group_id1, group_id2, comb) %>%
    filter(n() > 1)
},
zac_dt = {
  dt[, comb := do.call(paste, c(.SD, sep = "")), .SDcols = c("trait_id1", "trait_id2")][, freq := .N, by = .(group_id1, group_id2, comb)][freq > 1]
},
chin_dt = {
  dt[id %in% dt[, .SD[, if (.N > 1) id, by=.(trait_id1, trait_id2)], by=.(group_id1, group_id2)]$V1]
}, times = 100)
Unit: milliseconds
expr min lq mean median uq max neval
zac_tidy 4.151115 4.677328 6.150869 5.552710 7.765968 8.886388 100
zac_dt 1.965013 2.201499 2.829999 2.640686 3.507516 3.831240 100
chin_dt 4.567210 5.217439 6.972013 7.330628 8.233379 12.807005 100
> identical(zac_dt, chin_dt)
[1] TRUE
With 10 repeats:
Unit: milliseconds
expr min lq mean median uq max neval
zac_tidy 12.492261 14.169898 15.658218 14.680287 16.31024 22.062874 10
zac_dt 10.169312 10.967292 12.425121 11.402416 12.23311 21.036535 10
chin_dt 6.381693 6.793939 8.449424 8.033886 9.78187 12.005604 10
chin_dt2 5.536246 6.888020 7.914103 8.310142 8.74655 9.600121 10
I therefore recommend Chinsoon's method; either works, though.
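For readability, a sketch of Chinsoon's one-liner from the benchmark, spelled out step by step (same logic: within each (group_id1, group_id2) group, collect the ids of trait combinations occurring more than once, then subset the original table by those ids):

keep_ids <- dt[, .SD[, if (.N > 1) id, by = .(trait_id1, trait_id2)],
               by = .(group_id1, group_id2)]$V1
dt[id %in% keep_ids]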
Answer 3 (score: 1)
The other answers did not solve the problem, but I found a method inspired by them: first count the number of keys present in the group and, for each trait combination, keep only the rows which have the full number of keys:
intersect_this_by2 <- function(dt,
                               key_by = NULL,
                               split_by = NULL,
                               intersect_by = NULL){
  if (is.null(intersect_by) |
      is.null(key_by) |
      !is.data.frame(dt) |
      nrow(dt) == 0) {
    return(dt)
  }
  data_table_input <- is.data.table(dt)
  dtc <- as.data.table(dt)
  if (!is.null(split_by)) {
    # compute number of keys in the group
    dtc[, n_keys := uniqueN(.SD), by = split_by, .SDcols = key_by]
    # compute number of keys represented by each trait in each group
    # and keep row only if they represent all keys from the group
    dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by, split_by), .SDcols = key_by]
    dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
  } else {
    dtc[, n_keys := uniqueN(.SD), .SDcols = key_by]
    dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by), .SDcols = key_by]
    dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
  }
  if (!data_table_input) {
    return(as.data.frame(dtc))
  } else {
    return(dtc)
  }
}
The problem is that it is much slower on my real dataset (5-6 times slower), but I think this function helps to understand the problem better. Also, a dataset closer to my real one is defined below:
pacman::p_load(data.table, microbenchmark, testthat)
set.seed(0)
n <- 1e7
p <- 1e5
m <- 5
dt <- data.table(id = 1:n,
                 key1 = sample(1:m, size = n, replace = TRUE),
                 group_id1 = sample(1:2, size = n, replace = TRUE),
                 trait_id1 = sample(1:p, size = n, replace = TRUE),
                 group_id2 = sample(1:2, size = n, replace = TRUE),
                 trait_id2 = sample(1:2, size = n, replace = TRUE),
                 extra = sample(letters, n, replace = TRUE))
microbenchmark::microbenchmark(
  old_way = {res <- intersect_this_by(dt,
                                      key_by = c("key1"),
                                      split_by = c("group_id1", "group_id2"),
                                      intersect_by = c("trait_id1", "trait_id2"))},
  new_way = {res <- intersect_this_by2(dt,
                                       key_by = c("key1"),
                                       split_by = c("group_id1", "group_id2"),
                                       intersect_by = c("trait_id1", "trait_id2"))},
  times = 1)
Unit: seconds
expr min lq mean median uq max neval
old_way 5.891489 5.891489 5.891489 5.891489 5.891489 5.891489 1
new_way 18.455860 18.455860 18.455860 18.455860 18.455860 18.455860 1
For information, the number of rows of res in this example is:
> set.seed(0)
> n <- 1e7
> p <- 1e5
> m <- 5
> dt <- data.table(id = 1:n,
key1 = sample(1:m, size = n, replace = TRUE),
group_id1 = sample(1:2, size = n, replace = TRUE),
trait_id1 = sample(1:p, size = n, replace = TRUE),
group_id2 = sample(1:2, size = n, replace = TRUE),
trait_id2 = sample(1:2, size = n, replace = TRUE),
extra = sample(letters, n, replace = TRUE))
> res <- intersect_this_by(dt,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
> nrow(res)
[1] 7099860
> res <- intersect_this_by2(dt,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
> nrow(res)
[1] 7099860