R data.table的加速查询 - 这个双参数函数能否更快地被组应用?

时间:2017-10-20 10:41:29

标签: r performance data.table

是否可以使用data.table在数据集中按组快速应用双参数功能?在一百万行数据集中,我发现调用下面定义的简单函数需要超过11秒,这比我对这种复杂性的预期要长得多。

下面的自包含代码概述了我要做的事情的基本要点:

# generate data frame - 1 million rows
library(data.table)
set.seed(42)
nn = 1e6
daf = data.frame(aa=sample(1:1000, nn, repl=TRUE),
                 bb=sample(1:1000, nn, repl=TRUE),
                 xx=rnorm(nn),
                 yy=rnorm(nn),
                 stringsAsFactors=FALSE)

# myfunc is the function to apply to each group
myfunc = function(xx, yy) {
  if (max(yy)>1) { 
    return(mean(xx))
  } else {
    return(weighted.mean(yy, ifelse(xx>0, 2, 1)))
  }
}

# running the function takes around 11.5 seconds
system.time({
  dt = data.table(daf, key=c("aa","bb"))
  dt = dt[,myfunc(xx, yy), by=c("aa","bb")]
})

head(dt)
# OUTPUT:
#    aa bb          V1
# 1:  1  2 -1.02605645
# 2:  1  3 -0.49318243
# 3:  1  4  0.02165797
# 4:  1  5  0.40811793
# 5:  1  6 -1.00312393
# 6:  1  7  0.14754417

有没有办法大幅缩短这样的函数调用时间?

我感兴趣的是,在没有完全重写函数调用的情况下是否有更有效的方法来执行上述计算,或者是否只能通过拆分函数来加速它,并以某种方式在data.table语法中重写它

非常感谢您的回复。

3 个答案:

答案 0 :(得分:2)

您的结果:

system.time({
  dt = data.table(daf, key = c("aa","bb"))
  dt = dt[,myfunc(xx, yy), by = c("aa","bb")]
}) # 21.25
dtInitial <- copy(dt)

V1:如果NA值与您无关,您可以像这样修改您的功能:

myfunc2 = function(xx, yy) {
  if (max(yy) > 1) { 
    return(mean(xx))
  } else {
    w <- ifelse(xx > 0, 2, 1)
    return(sum((yy * w)[w != 0])/sum(w))
  }
}

system.time({
  dt = data.table(daf, key = c("aa","bb"))
  dtM = dt[, myfunc2(xx, yy), by = c("aa","bb")]
}) # 6.69  
all.equal(dtM, dtInitial)
# [1] TRUE

V2:另外,你可以这样做得更快:

system.time({
dt3 <- data.table(daf, key = c("aa","bb"))
dt3[, maxy := max(yy), by = c("aa","bb")]
dt3[, meanx := mean(xx), by = c("aa","bb")]
dt3[, w := ifelse(xx > 0, 2, 1)]
dt3[, wm2 := sum((yy * w)[w != 0])/sum(w), by = c("aa","bb")]
r2 <- dt3[, .(aa, bb, V1 = ifelse(maxy > 1, meanx, wm2))]
r2 <- unique(r2)
}) #2.09 
all.equal(r2, dtInitial)
# [1] TRUE

20 sek vs 2 sek for me

更新

或者快一点:

system.time({
  dt3 <- data.table(daf, key = c("aa","bb"))
  dt3[, w := ifelse(xx > 0, 2, 1)]
  dt3[, yyw := yy * w]
  r2 <- dt3[, .(maxy = max(yy),
                meanx = mean(xx),
                wm2 = sum(yyw)/sum(w)), 
            , by = c("aa","bb")]
  r2[, V1 := ifelse(maxy > 1, meanx, wm2)]
  r2[, c("maxy", "meanx", "wm2") := NULL]
}) # 1.51

all.equal(r2, dtInitial)
# [1] TRUE

答案 1 :(得分:1)

另一种解决方案

system.time({
  dat <- data.table(daf, key = c("aa","bb"))
  dat[, xweight := (xx > 0) * 1 + 1]
  result <- dat[, list(MaxY = max(yy), Mean1 = mean(xx), Mean2 = sum(yy*xweight)/sum(xweight)), keyby=c("aa", "bb")]
  result[, FinalMean := ifelse(MaxY > 1, Mean1, Mean2)]
})

   user  system elapsed 
  1.964   0.059   1.348

答案 2 :(得分:1)

我找到了进一步加速8倍的方法,这可以将机器上的时间缩短到0.2秒左右。见下文。我们不是直接为每个组计算sum(yyw)/ sum(w),而是耗费时间,而是计算每个组的sum(yyw)和sum(w)的数量,然后才进行除法。魔术!

system.time({
  dt <- data.table(daf, key = c("aa","bb"))
  dt[, w := 1][xx > 0, w := 2]
  dt[, yyw := yy * w]
  res <- dt[, .(maxy = max(yy), 
                meanx = mean(xx), 
                wm2num = sum(yyw), 
                wm2den = sum(w)),
                by = c("aa","bb")]
  res[, wm2 := wm2num/wm2den]            
  res[, V1 := wm2][maxy > 1, V1 := meanx]

  res[, c("maxy", "meanx", "wm2num", "wm2den", "wm2") := NULL]
}) # 0.19

all.equal(res, dtInitial)
# [1] TRUE