根据因子的连续出现(游程)对列进行求和

时间:2017-04-06 09:30:02

标签: r

我有一个鱼类标签信号的数据集,想根据游泳速度计算不同行为状态(例如静止 static、巡航 cruise、爆发 burst)的持续时间,以便计算各行为状态的频率。我用 for 循环实现了这一点,但在我的大型数据集上运行非常慢。我确信可以用 R 的某个 apply 系列函数来完成,但我不知道该怎么做。

这就是我的数据:

    Period PEN SEC BLSEC     BS  BScount CountTF  BSdur
380   7045   7   7 0.204 cruise        2   FALSE     NA
381   7045   7   7 0.694 cruise        3   FALSE     NA
382   7045   7   7 0.325 cruise        4    TRUE     21
383   7045   7   7 0.000 static        1    TRUE      7
384   7045   7   7 0.197 cruise        1   FALSE     NA
385   7045   7   7 0.312 cruise        2   FALSE     NA
386   7045   7   7 0.242 cruise        3    TRUE     21
387   7045   7   7 0.096 static        1    TRUE      7
388   7045   7   7 0.274 cruise        1   FALSE     NA
389   7045   7   7 0.268 cruise        2   FALSE     NA
390   7045   7   7 0.312 cruise        3   FALSE     NA
391   7045   7   7 0.694 cruise        4   FALSE     NA
392   7045   7   7 0.268 cruise        5   FALSE     NA

SEC 是两次标签 ping 之间的秒数(并不总是 7!),BLSEC 是每秒游过的体长数(即两次标签 ping 之间鱼的标准化游动距离)。我通过以下方式计算了 BS、BScount 和 CountTF:

# Speed thresholds applied to BLSEC (body lengths per second).
static <- 0.1
cruise <- 1

# Classify each row: BLSEC at or below `static` is "static", above `static`
# and up to `cruise` is "cruise", anything faster is "burst". The inner test
# only needs the upper bound because rows at or below `static` are already
# handled by the outer ifelse().
bsffile$BS <- ifelse(
  bsffile$BLSEC <= static, "static",
  ifelse(bsffile$BLSEC <= cruise, "cruise", "burst")
)
# Position of each row within its run of consecutive identical BS values.
bsffile$BScount <- sequence(rle(bsffile$BS)$lengths)
# TRUE on the last row of a run, i.e. where the counter does not increase on
# the following row. The final row has no successor and is set to FALSE.
bsffile$CountTF <- c(diff(bsffile$BScount) < 1, FALSE)

BSdur 是同一行为状态连续出现期间 SEC 的总和。我用以下方法计算了它:

# Running total of SEC for the current run of identical behaviour states.
bssum <- 0

# seq_len() iterates zero times on an empty data frame, whereas 1:nrow()
# would step through 1 and 0.
for (i in seq_len(nrow(bsffile))) {
  bssum <- bssum + bsffile[i, "SEC"]
  # On the last row of a run (CountTF is TRUE) with a non-missing SEC, store
  # the accumulated duration and restart the total; every other row gets NA.
  if (bsffile[i, "CountTF"] && !is.na(bsffile[i, "SEC"])) {
    bsffile[i, "BSdur"] <- bssum
    bssum <- 0
  } else {
    bsffile[i, "BSdur"] <- NA
  }
}

在我的数据集上运行大约需要五分钟。有什么建议可以让它运行得更快吗,例如使用某个 apply 系列函数?

以下是用 dput 导出的部分数据:

# Sample data in dput() form: 21 rows (row names 380 to 400) with the columns
# Period, PEN, SEC, BLSEC, BS, BScount, CountTF and BSdur.
structure(list(Period = c(7045, 7045, 7045, 7045, 7045, 7045, 
7045, 7045, 7045, 7045, 7045, 7045, 7045, 7045, 7045, 7045, 7045, 
7045, 7045, 7045, 7045), PEN = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("7", "8"), class = "factor"), SEC = c(7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7), BLSEC = c(0.204, 
0.694, 0.325, 0, 0.197, 0.312, 0.242, 0.096, 0.274, 0.268, 0.312, 
0.694, 0.268, 0.541, 0.796, 0.306, 0.089, 0.93, 0.389, 0.452, 
0.917), BS = c("cruise", "cruise", "cruise", "static", "cruise", 
"cruise", "cruise", "static", "cruise", "cruise", "cruise", "cruise", 
"cruise", "cruise", "cruise", "cruise", "static", "cruise", "cruise", 
"cruise", "cruise"), BScount = c(2L, 3L, 4L, 1L, 1L, 2L, 3L, 
1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 1L, 2L, 3L, 4L), CountTF = c(FALSE, 
FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, 
TRUE), BSdur = c(NA, NA, 21, 7, NA, NA, 21, 7, NA, NA, NA, NA, 
NA, NA, NA, 57, 7, NA, NA, NA, 28)), row.names = 380:400, .Names = c("Period", 
"PEN", "SEC", "BLSEC", "BS", "BScount", "CountTF", "BSdur"
), class = "data.frame")

2 个答案:

答案 0 :(得分:2)

使用 data.table 可以轻松实现:

library(data.table)
setDT(bsffile)
# rleid(BS) puts each run of consecutive identical BS values in its own group.
# Rows where CountTF is TRUE receive the group's total SEC; all other rows get
# NA, matching the BSdur column shown in the question. NA_real_ keeps the
# result numeric in every group.
bsffile[, BSdur := ifelse(CountTF, sum(SEC), NA_real_), by = .(rleid(BS))]

答案 1 :(得分:0)

我们可以使用 base R 中的 ave 执行此操作:
df1$BSdur <- with(df1, {
  # A new run starts on the first row and wherever BS differs from the row
  # above it; the cumulative sum of those flags is the run id.
  run_start <- c(TRUE, BS[-1] != BS[-length(BS)])
  # Per-run total of SEC, multiplied by CountTF so that rows where it is FALSE
  # become 0.
  ave(SEC, cumsum(run_start), FUN = sum) * CountTF
})
df1$BSdur
#[1]  0  0 21  7  0  0 21  7  0  0  0  0  0  0  0 57  7  0  0  0 28