Question

我有一些分层数据，需要对每个层（stratum）分别进行操作。我目前用 for 循环实现了这一点（见下面的例子），但由于要处理的数据集非常庞大，循环太慢了。我相信一定有办法加快速度，例如使用 apply 系列函数，可惜我没能找到更好的解决方案。

问题：我怎样才能提高此操作的速度？

# Example data. Only the loop further down matters; the data creation is
# incidental. The random draws are made in a fixed order so that the seed
# reproduces the same values.

set.seed(123)

N <- 100

# Stratum label of each observation
strata <- round(runif(n = N, min = 1, max = 1000))

# Two count variables
x1 <- rpois(n = N, lambda = 50)
x2 <- rpois(n = N, lambda = 50)

# Two binary group indicators, stored as factors
ind1 <- as.factor(rbinom(n = N, size = 1, prob = 0.3))
ind2 <- as.factor(rbinom(n = N, size = 1, prob = 0.6))

# x1 is zeroed wherever either indicator is 0;
# x2 is zeroed wherever ind1 is 0 or ind2 is 1.
x1[ind1 == 0 | ind2 == 0] <- 0
x2[ind1 == 0 | ind2 == 1] <- 0

# Totals over the whole data set
x1_sum <- sum(x1)
x2_sum <- sum(x2)

# # # # # The following loop is too slow # # # # #

# Start from a copy of x2; only selected entries are overwritten below.
new_values <- x2 # Apply the following operation strata by strata

# One iteration per distinct stratum value. The `for` header evaluates
# table(strata) once, but each of the three statements in the body calls
# table(strata) again on every iteration.
for(i in 1:length(table(strata))) {

  # Sum of x1 over the observations in the i-th stratum
  x1_sum_strata <- sum(x1[strata == as.numeric(names(table(strata)))[i]])

  # Sum of x2 over the observations in the i-th stratum
  x2_sum_strata <- sum(x2[strata == as.numeric(names(table(strata)))[i]])

  # Within this stratum, entries with x1 == 0 and ind1 == 1 receive the
  # overall ratio times the stratum ratio; all other entries keep their x2 value.
  new_values[x1 == 0 & ind1 == 1 & strata == as.numeric(names(table(strata)))[i]] <- 
    (x1_sum / x2_sum) * (x1_sum_strata / x2_sum_strata)
}

Answer 1

# # # # # loop # # # # #

# Start from a copy of x2; only selected entries are overwritten below.
new_values <- x2 # Apply the following operation strata by strata

# Loop invariants, computed once instead of on every iteration:
# the distinct stratum values and the mask of entries eligible for replacement.
stratum_values <- as.numeric(names(table(strata)))
replace_mask <- x1 == 0 & ind1 == 1

# seq_along() rather than 1:length(): it yields an empty sequence when there
# are no strata, so the body is skipped instead of running for i = 1 and i = 0.
for (i in seq_along(stratum_values)) {
  in_stratum <- strata == stratum_values[i]
  x1_sum_strata <- sum(x1[in_stratum])
  x2_sum_strata <- sum(x2[in_stratum])

  # Overall ratio times the stratum ratio, written to the eligible entries
  # of this stratum only.
  new_values[replace_mask & in_stratum] <-
    (x1_sum / x2_sum) * (x1_sum_strata / x2_sum_strata)
}

基准：

N <- 10000
# NOTE(review): antonios() and minem() are not defined in this excerpt;
# presumably they wrap the question's loop and the rewritten loop -- confirm.
rbenchmark::benchmark(
  antonios(),
  minem(),
  replications = 10
)
#         test replications elapsed relative user.self sys.self user.child sys.child
# 1 antonios()           10    8.77   11.101      5.58     1.70         NA        NA
# 2    minem()           10    0.79    1.000      0.76     0.02         NA        NA

Answer 2

我认为@digEmAll是对的，瓶颈不在你的循环中。让我们把数据放大一点：

set.seed(123)

# Larger variant of the example data: 1000 observations, stratum labels
# drawn from 1..10000. The random draws are made in a fixed order so that
# the seed reproduces the same values.
N <- 1000
strata <- round(runif(n = N, min = 1, max = 10000))

# Two count variables
x1 <- rpois(n = N, lambda = 50)
x2 <- rpois(n = N, lambda = 50)

# Two binary group indicators, stored as factors
ind1 <- as.factor(rbinom(n = N, size = 1, prob = 0.3))
ind2 <- as.factor(rbinom(n = N, size = 1, prob = 0.6))

# x1 is zeroed wherever either indicator is 0;
# x2 is zeroed wherever ind1 is 0 or ind2 is 1.
x1[ind1 == 0 | ind2 == 0] <- 0
x2[ind1 == 0 | ind2 == 1] <- 0

# Totals over the whole data set
x1_sum <- sum(x1)
x2_sum <- sum(x2)

# # # # # The following loop is too slow # # # # #

# Start from a copy of x2; the loop overwrites selected entries stratum by stratum.
new_values <- x2

现在用你的方法在我的电脑上运行需要大约10秒

# Console transcript: the loop as posted in the question, timed with system.time().
# Every statement in the body calls table(strata) again on each iteration.
> system.time(for(i in 1:length(table(strata))) {
+   x1_sum_strata <- sum(x1[strata == as.numeric(names(table(strata)))[i]])
+   x2_sum_strata <- sum(x2[strata == as.numeric(names(table(strata)))[i]])
+   new_values[x1 == 0 & ind1 == 1 & strata == as.numeric(names(table(strata)))[i]] <- 
+     (x1_sum / x2_sum) * (x1_sum_strata / x2_sum_strata)
+ })
   user  system elapsed 
   9.67    0.02    9.71 
>

但如果你先把 `as.numeric(names(table(strata)))` 的结果保存到一个新变量中，运行速度大约会快 100 倍：

# Console transcript: same loop, but the distinct stratum values are computed
# once and kept in x. The table(strata) call that remains in the `for` header
# is evaluated only once, when the loop starts.
> x=as.numeric(names(table(strata)))
> system.time(for(i in 1:length(table(strata))) {
+   x1_sum_strata <- sum(x1[strata == x[i]])
+   x2_sum_strata <- sum(x2[strata == x[i]])
+   new_values[x1 == 0 & ind1 == 1 & strata == x[i]] <- (x1_sum / x2_sum) * (x1_sum_strata / x2_sum_strata)
+ }
+ )
   user  system elapsed 
   0.11    0.00    0.11 
>

Answer 3

我发现，先编写一个只作用于单个层、并且只对该层执行必要计算的函数会很有帮助；这样就可以针对边界情况等对该函数进行调试。

# Ratio of the total of `x` to the total of `y`.
# Returns a single number: sum(x) / sum(y).
f <- function(x, y) {
  sum(x) / sum(y)
}

如果采用 'tidyverse' 的思路，通常会把数据组织成 tibble（data.frame），并用几个简单的操作来处理（按层对数据进行分组；对每个组进行汇总）：

library(tidyverse)

# One row per observation. tibble() builds the table from the vectors;
# tbl() references a table in a data source and has no method for plain
# numeric vectors, so calling it here fails.
tbl <- tibble(x1, x2, strata)

# One row per stratum: value = sum(x1) / sum(x2) within that stratum.
ans0 <- tbl %>%
  group_by(strata) %>%
  summarize(value = f(x1, x2))

然后可以考虑如何修改此结果以获得最终答案，例如，用完整数据上计算出的值对每个层的值进行缩放：

# Scale each stratum's ratio by the same ratio computed on the full data.
ans <- ans0 %>%
  mutate(value = value * f(tbl$x1, tbl$x2))

关于这一点的一个好处是结果是一个tibble，所以整个过程可以用相同类型的操作重复进行分析的下一步。

如何提高复杂for循环的速度？

3 个答案: