Question

我需要定义操作块：对于同一个ID，如果某次操作距离上一次操作不到30天，就把它归入与上一次操作相同的块；如果间隔达到或超过30天，则块标签加一（即标签2、3、4……）。每个新ID的标签都从1开始。

以下是数据：

# Sample data: one row per action, rows of the same id stored together.
dat = data.frame(
  id = rep(c(1, 16, 17), times = c(2, 3, 24)),
  # day_id: action date written as a %Y%m%d number (kept numeric here; it
  # could be converted to a Date instead).
  day_id = c(
    20130702, 20130121, 20131028, 20131028, 20130531, 20140513, 20140509,
    20140430, 20140417, 20140411, 20140410, 20140404,
    20140320, 20140313, 20140305, 20140224, 20140213, 20140131, 20140114,
    20130827, 20130820, 20130806, 20130730, 20130723,
    20130719, 20130716, 20130620, 20130620, 20130614
  ),
  # diff: number of days between consecutive actions (day_ids); NA on the
  # first row of each id.
  diff = c(
    NA, 162, NA, 0, 150, NA, 4, 9, 13, 6, 1, 6, 15, 7, 8, 9, 11, 13, 17, 140,
    7, 14, 7, 7, 4, 3, 26, 0, 6
  ),
  # new_id: flag, 1 on the first row of an id and 0 otherwise.
  new_id = c(1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
)

我用for循环实现了这一点，并设法避免了循环嵌套（见下文），但似乎无法去掉外层循环。当ID有成千上万个时，这当然会变得极其缓慢。在下面的例子中，'call_block' 就是我想在不使用for循环的情况下重现的结果。有人能帮我解决这个问题吗？

# Label "action blocks" per id: within one id an action stays in the same block
# as the previous action unless the gap between them is max_days or more, in
# which case the block label increases by one. Labels restart at 1 for each id.
max_days = 30

# ave() applies the labelling function to the diff values of each id and writes
# the results back at the original row positions, so the rows of an id do not
# have to be contiguous and no result vector is grown inside a loop.
r = as.integer(ave(dat$diff, dat$id, FUN = function(gap) {
  # The first action of an id always opens block 1, whatever its gap value is;
  # an NA gap never opens a new block.
  opens_block = seq_along(gap) > 1L & !is.na(gap) & gap >= max_days
  cumsum(opens_block) + 1L
}))
dat$call_block = r

谢谢！

Answer 1

在此发布 @alexis_laz 的答案以结束该问题

library(data.table)
# Assign block labels to the actions of a single id.
#
# x        : numeric vector of day gaps between consecutive actions, in row
#            order. The first element is ignored because the first action
#            always belongs to block 1.
# max_days : gap at or above which a new block starts; defaults to 30.
#
# Returns a numeric vector of the same length as x holding labels 1, 2, 3, ...
# NA gaps never start a new block, and an empty x yields an empty result.
f = function(x, max_days = 30) {
  # Position 1 is masked out explicitly so that neither an NA nor a large
  # first gap can shift the labels.
  starts_block = seq_along(x) > 1L & !is.na(x) & x >= max_days
  cumsum(starts_block) + 1
}
# Convert the data frame to a data.table, then evaluate f() on the diff column
# separately within each id to obtain the call_block labels.
df = data.table(dat)
df2 = df[, .(call_block = f(diff)), by = "id"]

避免for循环：在一个时间范围内定义动作块

1 个答案: