Question

我需要估算不同组中的一些统计数据。我的实际数据集有超过1000万行。

使用for循环很容易做到。但是，我尝试使用data.table做同样的事情。但是，与for循环相比，它非常慢。

library("data.table")
set.seed(1234) # reproducibility
dt1 <- data.table(id=rep(letters[1:5], rep(10,5)), group=1:10, value=rnorm(100))

# Method 1: using for loops
loopFUN <- function(DT){
  Out1 <- list()
  idx <- 1
  for(i in unique(DT$id)){
    for(g in unique(DT$group)){
      sub.g <- subset(DT, group==g)
      sub.i.g <- subset(sub.g, id==i)
      vt <- var.test(x=sub.i.g$value, y=sub.g$value, alternative="greater", ratio=1)
      Out1[[idx]] <- c(i, g, vt$statistic, sqrt(vt$estimate),vt$p.value)
      idx <- idx + 1
    }
  }
  Out1 <- data.table(do.call(rbind, Out1))
  colnames(Out1) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out1)
}

# Method 2: using a function inside tada.table
dtFUN <- function(DT){
  test <- function(x, y){
    VT <- var.test(x, y, alternative="greater", ratio=1)
    res <- c(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
    return(res)
  }
  setkey(DT, id, group)
  Out2 <- DT[, {x=value; DT[, list(g2=group, vt=test(x, value)), by=group]}, by=.(id,group)]
  Out2 <- subset(Out2, group==g2)[, .(id, group, g2=rep(1:3,50), vt)]
  Out2 <- dcast(Out2, id + group ~ g2, value.var = "vt")
  colnames(Out2) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out2)
}


LP <- loopFUN(dt1)
DT <- dtFUN(dt1)
all(LP==DT)

library("microbenchmark")
compare <- microbenchmark(LP <- loopFUN(dt1),
                          DT <- dtFUN(dt1),
                          times = 100)

# Results
              expr       min        lq      mean   median        uq       max neval cld
LP <- loopFUN(dt1)  42.80655  45.33526  47.35974  46.2085  47.90351  88.42227   100  a 
DT <- dtFUN(dt1)   140.41182 145.26643 152.61285 149.7490 154.57031 276.77088   100   b

我可以得到正确的结果。但是我正在努力提高其性能。我想知道是否有人可以给我一些有关如何使其更快的建议。谢谢。

Answer 1

另一种方法：

dt1[, { 
        gv <- value
        .SD[, {
                VT <- var.test(value, gv, alternative="greater", ratio=1)
                .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
            }, 
            by=.(id)]
    }, by=.(group)]

计时代码（时间太长，无法使用OP的方法，因此我将其取出），另一种方法剥夺了var.test中的支票（谨慎使用）：

mtd2 <- function() {
    dt1[, { 
            gv <- value
            leng <- .N
            .SD[, {
                    VT <- var.test(value, gv, alternative="greater", ratio=1)
                    .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
                }, 
                by=.(id)]
        }, by=.(group)]
}

mtd3 <- function() {
    dt1[, { 
        gv <- value
        leng <- .N
        .SD[, {
            #see stats:::var.test.default
            ESTIMATE <- var(value) / var(gv)
            .(F.Stat=ESTIMATE, P.value=1 - pf(ESTIMATE, .N - 1L, leng - 1L))
        }, 
            by=.(id)]
    }, by=.(group)][, SD.Ratio:=sqrt(F.Stat)][]
}

library(bench)
bench::mark(mtd2(), mtd3(), check=FALSE)

时间：

# A tibble: 2 x 14
  expression      min     mean   median      max `itr/sec` mem_alloc  n_gc n_itr total_time result                    memory                  time     gc              
  <chr>      <bch:tm> <bch:tm> <bch:tm> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list>                    <list>                  <list>   <list>          
1 mtd2()        2.75s    2.75s    2.75s    2.75s     0.364     1.9GB    54     1      2.75s <data.table [10,000 x 5]> <Rprofmem [82,295 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd3()        1.29s    1.29s    1.29s    1.29s     0.774    13.6MB     4     1      1.29s <data.table [10,000 x 5]> <Rprofmem [2,432 x 3]>  <bch:tm> <tibble [1 x 3]>

数据：

library(data.table)
set.seed(1234) # reproducibility
nr <- 1e6
ng <- 100
dt1 <- data.table(id=sample(ng, nr, TRUE), 
    group=sample(ng, nr, TRUE), 
    value=rnorm(nr))

在data.table内部执行for循环

1 个答案: