如何将这个双索引for循环转换为有效的循环?

时间:2016-05-30 18:52:37

标签: r loops for-loop lapply mapply

我是R和这里的新人。我经常使用这个网站,但这次我遇到了一个无法找到解决方案的问题。

我有一个这样的数据框:

df <- data.frame(id = c("A","A","B","B","B","C","D","D","D","D"),
             elapsed = c(10,15,5,7,20,4,3,4,15,16),
             val = c(5,8,1,2,4,6,7,9,8,3),
             acum = c(0,0,0,0,0,0,0,0,0,0))

我需要累积每一行,总数&#34; val&#34;来自其他具有相同&#34; id&#34;的行和#34;已过去&#34;电流不超过5个单位(以秒为单位测量)。

I.e。:我选择一行,比如说i,然后注册它的&#34; val&#34; in acum [i]。然后我回顾前一行,i-1。我检查它是否具有与其相同的ID,并且#34;已过去&#34;不小于第i行的5。如果是,我将它的值加到acum [i]。 我重复所有优势滞后的步骤,依此类推。

为了做到这一点,我打电话给这个循环:

 for (i in 2:nrow(df)) {
   for(l in 0:nrow(df)) {
    if(l<i) {
      if (df[i,"id"]==df[i-l,"id"]) 
        {if (df[i,"elapsed"]-df[i-l,"elapsed"]<=5)
       {df$acum[i] <- df$acum[i]+df[i-l,"val"]}
      }
    } 
   }
 }

#    id elapsed val acum
# 1   A      10   5    0
# 2   A      15   8   13
# 3   B       5   1    1
# 4   B       7   2    3
# 5   B      20   4    4
# 6   C       4   6    6
# 7   D       3   7    7
# 8   D       4   9   16
# 9   D      15   8    8
# 10  D      16   3   11

问题是随着data.frame变大,循环花费的时间越来越多(甚至几个小时)。

我浏览了google并浏览了Stackoverflow,每个人都给出了同样的建议:使用apply函数。但在这种情况下,我无法弄清楚如何做到这一点。也许是mapply,但由于我是新手,我不知道如何。

有人可以帮我吗?

提前谢谢你,圣地亚哥.-

2 个答案:

答案 0 :(得分:0)

考虑按id组创建滞后变量,然后运行ifelse()

# LAGGED GROUP VARIABLES
df$lastelapsed <- sapply(1:nrow(df),
                         function(i) sum((df$id[i-1] == df$id[i]) * df$elapsed[i-1]))
df$lastvalue <- sapply(1:nrow(df),
                         function(i) sum((df$id[i-1] == df$id[i]) * df$val[i-1]))

# ROW CALCULATION
df$acumtest <- ifelse((df$elapsed - df$lastelapsed) <= 5, df$val + df$lastvalue, df$val)

答案 1 :(得分:0)

您可以根据每个ID中的elapsed > 5条件创建新的分组变量,然后使用您喜欢的聚合工具

df <- data.frame(id = c("A","A","B","B","B","C","D","D","D","D"),
                 elapsed = c(10,15,5,7,20,4,3,4,15,16),
                 val = c(5,8,1,2,4,6,7,9,8,3),
                 acum = c(0,0,0,0,0,0,0,0,0,0))

within(df, {
  grp <- paste(id, ave(elapsed, id, FUN = function(x)
    cumsum(c(FALSE, diff(x) > 5))))
  acum <- ave(val, grp, FUN = cumsum)
})

#    id elapsed val acum grp
# 1   A      10   5    5 A.0
# 2   A      15   8   13 A.0
# 3   B       5   1    1 B.0
# 4   B       7   2    3 B.0
# 5   B      20   4    4 B.1
# 6   C       4   6    6 C.0
# 7   D       3   7    7 D.0
# 8   D       4   9   16 D.0
# 9   D      15   8    8 D.1
# 10  D      16   3   11 D.1

目前解决方案的一些基准:

library('dplyr')
library('data.table')

rawr <- function(df) {
  df <- within(df, {
    grp <- paste(id, ave(elapsed, id, FUN = function(x)
      cumsum(c(FALSE, diff(x) > 5))))
    acum <- ave(val, grp, FUN = cumsum)
    })
  df
}

## shitty data table version, I'm sure it's wrong
## rest assured someone will point it out
rawr_dt <- function(df) {
  dt <- as.data.table(df)
  dt[, grp := cumsum(c(FALSE, diff(elapsed) > 5)), by = 'id'][, acum := cumsum(val), c('id', 'grp')]
  dt[, grp := NULL]
  dt
}

sfucci <- function(df) {
  for (i in 2:nrow(df)) {
    for(l in 0:nrow(df)) {
      if(l<i) {
        if (df[i,"id"]==df[i-l,"id"]) 
        {if (df[i,"elapsed"]-df[i-l,"elapsed"]<=5)
        {df$acum[i] <- df$acum[i]+df[i-l,"val"]}
        }
      } 
    }
  }
  df
}

Parfait <- function(df) {
  df$lastelapsed <- sapply(1:nrow(df), function(i) sum((df$id[i-1] == df$id[i]) * df$elapsed[i-1]))
  df$lastvalue <- sapply(1:nrow(df), function(i) sum((df$id[i-1] == df$id[i]) * df$val[i-1]))
  df$acumtest <- ifelse((df$elapsed - df$lastelapsed) <= 5, df$val + df$lastvalue, df$val)
  df
}
alistaire <- function(df) {
  df %>%
    group_by(id) %>%
    mutate(acum = val + (lag(val, default = 0) *
                           ifelse(lag(elapsed, default = 0) >= (elapsed - 5), 1, 0)))
}

acc <- rawr(df)$acum
identical(acc, rawr_dt(df)$acum)
# [1] TRUE
# identical(acc, sfucci(df)$acum)
identical(acc, Parfait(df)$acumtest)
# [1] TRUE
identical(acc, alistaire(df)$acum)
# [1] TRUE

library('microbenchmark')
microbenchmark(sfucci(df), rawr(df), rawr_dt(df), Parfait(df), alistaire(df), unit = 'relative')

# Unit: relative
#           expr       min       lq      mean   median        uq      max neval   cld
#     sfucci(df) 11.596961 9.990698 10.082249 9.952529 10.220162 5.603044   100     e
#       rawr(df)  1.000000 1.000000  1.000000 1.000000  1.000000 1.000000   100 a    
#    rawr_dt(df)  3.771649 3.483610  3.472160 3.436365  3.531379 1.945339   100    d 
#    Parfait(df)  3.392426 2.980234  3.008432 2.902410  3.006896 2.361832   100   c  
#  alistaire(df)  2.140693 2.042809  2.080444 2.028151  2.029965 2.638486   100  b