我需要估算不同组中的一些统计数据。我的实际数据集有超过1000万行。
使用for循环很容易做到。但是,我尝试使用data.table做同样的事情。但是,与for循环相比,它非常慢。
library("data.table")
set.seed(1234) # reproducibility
dt1 <- data.table(id=rep(letters[1:5], rep(10,5)), group=1:10, value=rnorm(100))
# Method 1: using for loops
loopFUN <- function(DT){
Out1 <- list()
idx <- 1
for(i in unique(DT$id)){
for(g in unique(DT$group)){
sub.g <- subset(DT, group==g)
sub.i.g <- subset(sub.g, id==i)
vt <- var.test(x=sub.i.g$value, y=sub.g$value, alternative="greater", ratio=1)
Out1[[idx]] <- c(i, g, vt$statistic, sqrt(vt$estimate),vt$p.value)
idx <- idx + 1
}
}
Out1 <- data.table(do.call(rbind, Out1))
colnames(Out1) <- c("id","group","F.Stat","SD.Ratio","P.value")
return(Out1)
}
# Method 2: using a function inside tada.table
dtFUN <- function(DT){
test <- function(x, y){
VT <- var.test(x, y, alternative="greater", ratio=1)
res <- c(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
return(res)
}
setkey(DT, id, group)
Out2 <- DT[, {x=value; DT[, list(g2=group, vt=test(x, value)), by=group]}, by=.(id,group)]
Out2 <- subset(Out2, group==g2)[, .(id, group, g2=rep(1:3,50), vt)]
Out2 <- dcast(Out2, id + group ~ g2, value.var = "vt")
colnames(Out2) <- c("id","group","F.Stat","SD.Ratio","P.value")
return(Out2)
}
LP <- loopFUN(dt1)
DT <- dtFUN(dt1)
all(LP==DT)
library("microbenchmark")
compare <- microbenchmark(LP <- loopFUN(dt1),
DT <- dtFUN(dt1),
times = 100)
# Results
expr min lq mean median uq max neval cld
LP <- loopFUN(dt1) 42.80655 45.33526 47.35974 46.2085 47.90351 88.42227 100 a
DT <- dtFUN(dt1) 140.41182 145.26643 152.61285 149.7490 154.57031 276.77088 100 b
我可以得到正确的结果。但是我正在努力提高其性能。我想知道是否有人可以给我一些有关如何使其更快的建议。谢谢。
答案 0 :(得分:1)
另一种方法:
dt1[, {
gv <- value
.SD[, {
VT <- var.test(value, gv, alternative="greater", ratio=1)
.(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
},
by=.(id)]
}, by=.(group)]
计时代码(时间太长,无法使用OP的方法,因此我将其取出),另一种方法剥夺了var.test
中的支票(谨慎使用):
mtd2 <- function() {
dt1[, {
gv <- value
leng <- .N
.SD[, {
VT <- var.test(value, gv, alternative="greater", ratio=1)
.(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
},
by=.(id)]
}, by=.(group)]
}
mtd3 <- function() {
dt1[, {
gv <- value
leng <- .N
.SD[, {
#see stats:::var.test.default
ESTIMATE <- var(value) / var(gv)
.(F.Stat=ESTIMATE, P.value=1 - pf(ESTIMATE, .N - 1L, leng - 1L))
},
by=.(id)]
}, by=.(group)][, SD.Ratio:=sqrt(F.Stat)][]
}
library(bench)
bench::mark(mtd2(), mtd3(), check=FALSE)
时间:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <list> <list>
1 mtd2() 2.75s 2.75s 2.75s 2.75s 0.364 1.9GB 54 1 2.75s <data.table [10,000 x 5]> <Rprofmem [82,295 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd3() 1.29s 1.29s 1.29s 1.29s 0.774 13.6MB 4 1 1.29s <data.table [10,000 x 5]> <Rprofmem [2,432 x 3]> <bch:tm> <tibble [1 x 3]>
数据:
library(data.table)
set.seed(1234) # reproducibility
nr <- 1e6
ng <- 100
dt1 <- data.table(id=sample(ng, nr, TRUE),
group=sample(ng, nr, TRUE),
value=rnorm(nr))