拆分下表的最高效(兼顾时间和空间)的方法是什么?
# Example input: the six (x, y) rows used throughout the question.
dt <- data.table(
  x = c(1, 3, 5, 4, 6, 2),
  y = c(4, 7, 1, 1, 2, 6)
)
> dt
x y
1: 1 4
2: 3 7
3: 5 1
4: 4 1
5: 6 2
6: 2 6
将其拆分为两个独立的表 dt1 和 dt2:当且仅当 (y,x) 也是 dt 中的一行时,(x,y) 行才属于 dt1;dt2 包含其余的行:
> dt1
x y
1: 1 4
2: 4 1
3: 6 2
4: 2 6
> dt2
x y
1: 3 7
2: 5 1
效率至关重要,完整的表有近 2 亿(200M)行。
答案 0 :(得分:5)
另一种选择是让表与自身做"反向"连接(把 x 和 y 两列互换后进行自连接):
# Join dt against its own distinct rows with the columns swapped
# (dt$x is matched to i$y and dt$y to i$x). `which = TRUE` returns the row
# numbers of dt that found a match and `nomatch = 0L` drops unmatched rows.
# unique() keeps a row number from being reported once per duplicate of its
# mirrored row; sort.int() puts the row numbers back in ascending order.
indx <- sort.int(
  dt[unique(dt), on = c(x = "y", y = "x"), which = TRUE, nomatch = 0L]
)
# Rows (x, y) whose mirror (y, x) is also a row of dt.
dt[indx]
# x y
# 1: 1 4
# 2: 4 1
# 3: 6 2
# 4: 2 6
# All remaining rows. `!indx` (data.table not-select) is used instead of
# `-indx`: negating an empty index yields integer(0), which would select no
# rows at all instead of every row when nothing matched.
dt[!indx]
# x y
# 1: 3 7
# 2: 5 1
基准测试——如果您不关心结果的顺序,我的解决方案在 2 亿行数据上似乎更快(两种解决方案返回的结果都是未排序的)
# Benchmark data: 2e8 rows, x and y each sampled from 1..1000.
set.seed(123)
bigdt <- data.table(
  x = sample(1e3, 2e8, replace = TRUE),
  y = sample(1e3, 2e8, replace = TRUE)
)

# Grouping approach: row numbers of groups (by unordered pair) with > 1 row.
system.time(
  i1 <- bigdt[, .I[.N > 1], by = .(X = pmax(x, y), Y = pmin(y, x))]$V1
)
# user system elapsed
# 21.81 0.82 22.97

# Self-join approach: match each row against the distinct rows, columns swapped.
system.time(
  indx <- bigdt[
    unique(bigdt),
    on = c(x = "y", y = "x"),
    which = TRUE,
    nomatch = 0L
  ]
)
# user system elapsed
# 17.74 0.90 18.80

# Checking if both unsorted and if identical when sorted
is.unsorted(i1)
# [1] TRUE
is.unsorted(indx)
# [1] TRUE
identical(sort.int(i1), sort.int(indx))
# [1] TRUE
下面是一个非退化的例子(即 indx != bigdt[, .I],并非每一行都能匹配):
# Benchmark data: n rows with x and y each sampled from 1..nv.
set.seed(123)
n <- 1e7
nv <- 1e4
DT <- data.table(
  x = sample(nv, n, replace = TRUE),
  y = sample(nv, n, replace = TRUE)
)
library(microbenchmark)
# NOTE(review): `DT[-idx]` selects zero rows if `idx` is empty; this benchmark
# presumably always finds matches, so the measured code is left as written.
microbenchmark(
  # Group by unordered pair, keep row numbers of groups with more than one row.
  akrun = {
    idx <- DT[, .I[.N > 1], by = .(pmax(x, y), pmin(x, y))]$V1
    list(DT[idx], DT[-idx])
  },
  # Build a string key per unordered pair and flag every duplicated key.
  akrun2 = {
    idx <- DT[, {
      pair_key <- paste(pmin(x, y), pmax(x, y))
      duplicated(pair_key) | duplicated(pair_key, fromLast = TRUE)
    }]
    list(DT[idx], DT[!idx])
  },
  # Self-join against the distinct rows with the columns swapped.
  davida = {
    idx <- DT[
      unique(DT),
      on = c(x = "y", y = "x"),
      which = TRUE,
      nomatch = 0L
    ]
    list(DT[idx], DT[-idx])
  },
  # Add the group size by reference, drop the column again, split on size > 1.
  akrun3 = {
    n <- DT[, N := .N, by = .(pmax(x, y), pmin(x, y))]$N
    DT[, N := NULL]
    split(DT, n > 1L)
  },
  times = 1
)
Unit: seconds
expr min lq mean median uq max neval
akrun 7.056609 7.056609 7.056609 7.056609 7.056609 7.056609 1
akrun2 22.810844 22.810844 22.810844 22.810844 22.810844 22.810844 1
davida 2.738918 2.738918 2.738918 2.738918 2.738918 2.738918 1
akrun3 5.662700 5.662700 5.662700 5.662700 5.662700 5.662700 1
答案 1 :(得分:4)
我们可以试试先用 pmax/pmin 把每一行的 (x, y) 归一化为无序对,再按它分组:
# Group rows by their unordered pair (larger value, smaller value) and keep
# the row numbers (.I) of every group that has more than one row.
# NOTE(review): this counts repeated rows as well as mirrored ones, so an
# exact duplicate (x, y) without a (y, x) partner is selected and a lone
# (a, a) row is not; a self-join on swapped columns treats such rows
# differently.
i1 <- dt[, .I[.N > 1], by = .(X = pmax(x, y), Y = pmin(y, x))]$V1
# Rows that belong to a pair group of size > 1.
dt[i1]
# x y
#1: 1 4
#2: 4 1
#3: 6 2
#4: 2 6
# All remaining rows. `!i1` (data.table not-select) is used instead of `-i1`:
# negating an empty index yields integer(0), which would select no rows at
# all instead of every row when no group qualifies.
dt[!i1]
# x y
#1: 3 7
#2: 5 1
另一种做法是使用 duplicated(参见上文基准测试中的 akrun2 写法)。
答案 2 :(得分:2)
这只是对 @David Arenburg 和 @akrun 的答案的补充(并不是想发布一个新答案,只是内容太长,不适合写成评论)。我还对带排序的版本做了计时:
library(microbenchmark)
library(data.table)
library(dplyr)
# Benchmark data: 2e8 rows, x and y each sampled from 1..1000.
set.seed(123)
bigdt <- data.table(
  x = sample(1e3, 2e8, replace = TRUE),
  y = sample(1e3, 2e8, replace = TRUE)
)
# f1: grouping approach, unsorted. Row numbers of rows whose unordered
# (x, y) pair occurs more than once in bigdt.
f1 <- function() {
  bigdt[, .I[.N > 1], by = .(X = pmax(x, y), Y = pmin(y, x))]$V1
}

# f2: same row numbers as f1, returned in ascending order (the grouped
# result is ordered by V1 in place before V1 is extracted).
f2 <- function() {
  grouped <- bigdt[, .I[.N > 1], by = .(X = pmax(x, y), Y = pmin(y, x))]
  setorder(grouped, V1)
  grouped[, V1]
}

# f3: self-join approach, unsorted. Row numbers of bigdt that match one of
# its distinct rows with the columns swapped.
f3 <- function() {
  bigdt[
    unique(bigdt),
    on = c(x = "y", y = "x"),
    which = TRUE,
    nomatch = 0L
  ]
}

# f4: same row numbers as f3, returned in ascending order.
f4 <- function() {
  matched <- bigdt[
    unique(bigdt),
    on = c(x = "y", y = "x"),
    which = TRUE,
    nomatch = 0L
  ]
  sort.int(matched)
}
# Time each of f1()..f4() twice. Each expression also assigns its result
# (i1..i4), which the checks after the benchmark rely on; the expression
# text is what appears in the `expr` column of the printed table.
res <- microbenchmark(
i1 <- f1(),
i2 <- f2(),
i3 <- f3(),
i4 <- f4(),
times = 2L)
# Show the timing summary.
print(res)
结果检查:
# Sortedness of each result: f1/f3 return row numbers in join/group order,
# f2/f4 return them in ascending order (trailing comments show the output).
is.unsorted(i1) # TRUE
is.unsorted(i2) # FALSE
is.unsorted(i3) # TRUE
is.unsorted(i4) # FALSE
# Once sorted, all four results contain exactly the same row numbers.
identical(sort.int(i1), i2) # TRUE
identical(sort.int(i3), i4) # TRUE
identical(i2, i4) # TRUE
结果如下:
Unit: seconds
expr min lq mean median uq max neval cld
i1 <- f1() 21.18695 21.18695 21.42634 21.42634 21.66572 21.66572 2 a
i2 <- f2() 47.16270 47.16270 47.79535 47.79535 48.42799 48.42799 2 b
i3 <- f3() 19.67623 19.67623 20.11365 20.11365 20.55108 20.55108 2 a
i4 <- f4() 57.21732 57.21732 57.78666 57.78666 58.35600 58.35600 2 c
总结:如果不需要排序的结果,f3() 最快;如果需要排序的结果,f2() 更快。