Vectorizing a for loop in R

Asked: 2015-10-09 11:41:02

Tags: r

I am struggling to vectorize the repeated application of the function below, which I currently have implemented as a for loop. This small example is indicative of the problem with a larger dataset, for which vectorizing could bring a worthwhile runtime improvement:

action = function(x,y,i) {

    firsttask = cumsum(x[which(x<y[i])])                    # cumulative sum of the values below the i-th limit
    secondtask = mean(firsttask)                             # mean of that cumulative sum
    thirdtask = min(firsttask[which(firsttask>secondtask)])  # smallest cumulative sum above the mean
    fourthtask = length(firsttask)                           # how many values were kept

    output = list(firsttask, data.frame(average=secondtask,
                                        min_over_mean=thirdtask,
                                        size=fourthtask))
    return(output)
}

thingofinterest = c(1:10)
limits = c(5:10)

test = vector("list", length = length(limits))
for(i in 1:length(limits)) {
  test[[i]] = action(thingofinterest, limits, i)
}

test

I want to replace the for loop with vectorized commands, not any of the apply family of functions, since they do not always improve performance (I am not saying there is anything wrong with for loops; in this case I simply need to optimize for speed. See: Is R's apply family more than syntactic sugar?). How can I do this?

3 Answers:

Answer 0 (score: 4):

You need to understand where the bottlenecks in your code are before you start trying to change it to make it faster. For example:

timer <- function(action, thingofinterest, limits) {
  st <- system.time({           # for the wall time
    Rprof(interval=0.01)        # Start R's profile timing
    for(j in 1:1000) {          # 1000 function calls
      test = vector("list")
      for(i in 1:length(limits)) {
        test[[i]] = action(thingofinterest, limits, i)
      }
    }
    Rprof(NULL)  # stop the profiler
  })
  # return profiling results
  list(st, head(summaryRprof()$by.total))
}
action = function(x,y,i) {
  firsttask = cumsum(x[which(x<y[i])])
  secondtask = min(firsttask[which(firsttask>mean(firsttask))])
  thirdtask = mean(firsttask)
  fourthtask = length(firsttask)
  output = list(firsttask, data.frame(min_over_mean=secondtask,
                                      average=thirdtask,
                                      size=fourthtask))
  return(output)
}
timer(action, 1:1000, 50:100)
# [[1]]
#    user  system elapsed 
#   9.720   0.012   9.737 
# 
# [[2]]
#                 total.time total.pct self.time self.pct
# "system.time"         9.72    100.00      0.07     0.72
# "timer"               9.72    100.00      0.00     0.00
# "action"              9.65     99.28      0.24     2.47
# "data.frame"          8.53     87.76      0.84     8.64
# "as.data.frame"       5.50     56.58      0.44     4.53
# "force"               4.40     45.27      0.11     1.13

You can see that very little time is spent outside of the call to the action function. Now, for is a special primitive and therefore isn't captured by the profiler, but the total time reported by the profiler is very similar to the wall-clock time, so there isn't much time unaccounted for by the profiler.
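If you want to convince yourself of that last point, you can compare the wall time directly with the profiler's own sampling time. A minimal standalone sketch (my own, using the same action, thingofinterest and limits as the timer() call above):

thingofinterest <- 1:1000
limits <- 50:100
Rprof(interval = 0.01)                  # start sampling
st <- system.time(
  for (j in 1:1000) {
    test <- vector("list")
    for (i in seq_along(limits)) test[[i]] <- action(thingofinterest, limits, i)
  }
)
Rprof(NULL)                             # stop sampling
# wall-clock time vs. the time the profiler actually accounted for
c(wall = st[["elapsed"]], profiled = summaryRprof()$sampling.time)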

What takes the most time within the action function is the call to data.frame. Remove that and you get an enormous speedup.

action1 = function(x,y,i) {
  firsttask = cumsum(x[which(x<y[i])])
  secondtask = mean(firsttask)
  thirdtask = min(firsttask[which(firsttask>mean(firsttask))])
  fourthtask = length(firsttask)
  list(task=firsttask, average=secondtask,
    min_over_mean=thirdtask, size=fourthtask)
}
timer(action1, 1:1000, 50:100)
# [[1]]
#    user  system elapsed 
#   1.020   0.000   1.021 
# 
# [[2]]
#               total.time total.pct self.time self.pct
# "system.time"       1.01    100.00      0.06     5.94
# "timer"             1.01    100.00      0.00     0.00
# "action"            0.95     94.06      0.17    16.83
# "which"             0.57     56.44      0.23    22.77
# "mean"              0.25     24.75      0.13    12.87
# "<"                 0.20     19.80      0.20    19.80

Now you can also get rid of one of the mean calls and both of the which calls:

action2 = function(x,y,i) {
  firsttask = cumsum(x[x < y[i]])
  secondtask = mean(firsttask)
  thirdtask = min(firsttask[firsttask > secondtask])
  fourthtask = length(firsttask)
  list(task=firsttask, average=secondtask,
    min_over_mean=thirdtask, size=fourthtask)
}
timer(action2, 1:1000, 50:100)
# [[1]]
#    user  system elapsed 
#   0.808   0.000   0.808 
# 
# [[2]]
#               total.time total.pct self.time self.pct
# "system.time"       0.80    100.00      0.12    15.00
# "timer"             0.80    100.00      0.00     0.00
# "action"            0.68     85.00      0.24    30.00
# "<"                 0.20     25.00      0.20    25.00
# "mean"              0.13     16.25      0.08    10.00
# ">"                 0.05      6.25      0.05     6.25

Now you can see that a "significant" amount of time is spent outside of the action function. I put significant in quotes because it is 15% of the run time, but only about 120 milliseconds. Compared with the original ~9.7 seconds this is roughly a 12x speedup, so if your actual code takes ~12 hours to run, this new action function should get it down to ~1 hour.

The results would be marginally better if I had pre-allocated the test list outside of the for loop in the timer function, but the call to data.frame was by far the biggest time consumer.
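For reference, the pre-allocation change mentioned above would look something like this inside timer (a small sketch of my own, not separately benchmarked):

# allocate the full list once, outside the loop, instead of growing it
test <- vector("list", length = length(limits))
for (i in seq_along(limits)) {
  test[[i]] <- action(thingofinterest, limits, i)
}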

Answer 1 (score: 3):

Here is a small comparison based on my comments above. I made the changes mentioned there (initializing test, changing the order of operations in action and, if you can accept it, dropping the data.frame call from the list output of action):

library(microbenchmark)
microbenchmark(f0(), f1())

Unit: microseconds
 expr       min        lq      mean     median        uq       max neval
 f0() 14042.192 14730.036 16091.508 15168.3175 16993.631 28193.770   100
 f1()   894.555   928.738  1094.448   985.2865  1190.252  4710.675   100

These modifications give roughly a 15x speedup.

Functions and data used for the comparison:

action0 = function(x,y,i) {
  firsttask = cumsum(x[which(x<y[i])])
  secondtask = min(firsttask[which(firsttask>mean(firsttask))])
  thirdtask = mean(firsttask)
  fourthtask = length(firsttask)
  output = list(firsttask, data.frame(min_over_mean=secondtask,
                                      average=thirdtask,
                                      size=fourthtask))
  return(output)
}

f0 <- function() {
  test = vector("list")
  for(i in 1:length(limits)) {
    test[[i]] = action0(thingofinterest, limits, i)
  }
}

thingofinterest = c(1:1000)
limits = c(50:100)

action1 = function(x,y,i) {
  firsttask = cumsum(x[which(x<y[i])])
  thirdtask = mean(firsttask)
  secondtask = min(firsttask[which(firsttask>thirdtask)])
  fourthtask = length(firsttask)
  list(firsttask, min_over_mean=secondtask,
                  average=thirdtask,
                  size=fourthtask)
}

f1 <- function() {
  test = vector("list", length = length(limits))
  for(i in 1:length(limits)) {
    test[[i]] = action1(thingofinterest, limits, i)
  }
}

Answer 2 (score: 2):

Just to add a point of comparison with the *apply family, I used this code (results verified with identical(f1(), f2()); f3 returns a different layout).

After testing, the which calls do give a speed improvement on a large thingofinterest vector.
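If you want to check that claim in isolation, a quick micro-test of my own (not part of the original comparison) could look like this:

library(microbenchmark)
x <- 1:1e6
# subsetting with a logical mask vs. converting the mask to indices first
microbenchmark(x[x < 5e5], x[which(x < 5e5)])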

thingofinterest = c(1:100000)
limits = c(50:1000)

action1 = function(x,y,i) {
  firsttask = cumsum(x[which(x<y[i])])
  thirdtask = mean(firsttask)
  secondtask = min(firsttask[which(firsttask>thirdtask)])
  fourthtask = length(firsttask)
  list(firsttask, min_over_mean=secondtask,
                  average=thirdtask,
                  size=fourthtask)
}

f1 <- function() {
  test = vector("list", length = length(limits))
  for(i in 1:length(limits)) {
    test[[i]] = action1(thingofinterest, limits, i)
  }
  return(test)
}


action2 <- function(x,y) {
  firsttask = cumsum(x[which(x<y)])
  thirdtask = mean(firsttask)
  secondtask = min(firsttask[which(firsttask>thirdtask)])
  fourthtask = length(firsttask)
  list(firsttask, min_over_mean=secondtask,
                  average=thirdtask,
                  size=fourthtask)
}

f2 <- function() {
  test <- lapply(limits,action2,x=thingofinterest)
  return(test)
}

f3 <- function() {
  test <- sapply(limits,action2,x=thingofinterest)
  return(test)
}

With a thingofinterest of 1M entries and 950 limits, here are the results on my machine:

> microbenchmark(f1(),f2(),f3(),times=10)
Unit: seconds
 expr      min       lq     mean   median       uq      max neval
 f1() 4.303302 4.336767 4.373119 4.380383 4.403434 4.441945    10
 f2() 4.267922 4.327208 4.450175 4.399422 4.423191 5.041011    10
 f3() 4.240551 4.293855 4.412548 4.362949 4.468117 4.730717    10

So in this case a cleanly written for loop isn't that bad.

I feel there may be a data.table way of doing the "action" work in a single pass, but for now that's beyond my knowledge.
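Purely as a guess at what such a single pass might look like (my own hedged sketch, not a tested solution; it trades a lot of memory for one grouped computation and assumes thingofinterest keeps its original ascending order within each group, which holds here since it is 1:N):

library(data.table)
# cross-join every limit with thingofinterest, keep the values below each limit,
# then compute the per-limit summaries in one grouped pass
dt <- CJ(limit = limits, x = thingofinterest)[x < limit]
dt[, firsttask := cumsum(x), by = limit]
summary_dt <- dt[, .(average       = mean(firsttask),
                     min_over_mean = min(firsttask[firsttask > mean(firsttask)]),
                     size          = .N),
                 by = limit]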

More on the speed topic: I don't think it can really be vectorized. The vectors are not subsets of one another, so their cumsums can't be "cut" to avoid recomputing the common parts.

As you said in the comments, limits usually has between 90 and 110 entries, so parallel processing may be the right approach here, computing each iteration on a different core since each iteration is independent of the others. (I'm thinking of mclapply, but there may be something else better suited to your use case.)
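A minimal sketch of that parallel idea, reusing action2 from above and assuming a Unix-alike system where mclapply can fork (mc.cores is just a placeholder, adjust it to your machine):

library(parallel)
# each limit is processed independently, so the iterations can run on separate cores
test_par <- mclapply(limits, action2, x = thingofinterest, mc.cores = 4)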