数据框dataAlog
包含变量'CashInd'
,'ACCOUNT'
,'DateTime'
和'Cnt'
(所有观察结果均为1)。每个ACCOUNT
和CashInd
的 4天 内的汇总计数。我尝试使用for循环来处理问题,虽然它很慢。我想改为使用apply
函数。
for (i in 1:dim(dataAlog)[1]) {
data <- subset(dataAlog, (dataAlog$CashInd==dataAlog$CashInd[i] & dataAlog$ACCOUNT==dataAlog$ACCOUNT[i]) & (dataAlog$DateTime<=dataAlog$DateTime[i]) & (dataAlog$DateTime >= as.POSIXct(as.Date(dataAlog$DateTime[i])-4)) )
if (dim(data)[1]==0) {
dataAlog$day4Count[i] <- 1
}
else{
agg <- aggregate(cnt ~ ACCOUNT, data = data, FUN=sum)
dataAlog$day4Count[i] <- agg$cnt
}
}
答案 0 :(得分:1)
以下是使用data.table
的建议解决方案。您可以使用滚动窗口4并计算窗口中最新日期4天前的日期数
数据:
library(zoo)
library(data.table)
#generate sample data
set.seed(0L)
N <- 20
dat <- data.table(CashInd=rep(1:2, N),
ACCOUNT=rep(LETTERS[1:2], each=N),
DateTime=sample(seq(Sys.Date()-2*N, Sys.Date(), "1 day"), N),
Amt=rnorm(2*N))
#it is impt that the data is sorted
setorder(dat, CashInd, ACCOUNT, DateTime)
计算:
#roll with window of 4 and calculate the number of dates after the date 4 days ago of latest date
nDaysAgo <- 4
dat[, c("COUNT", "CumuAmt") := {
list(sapply(DateTime, function(day) sum(between(DateTime, day-nDaysAgo, day))),
sapply(DateTime, function(day) sum(Amt[between(DateTime, day-nDaysAgo, day)])))
},
by = c("CashInd", "ACCOUNT")]
结果:
# CashInd ACCOUNT DateTime Amt COUNT CumuAmt
# 1: 1 A 2018-02-24 -0.22426789 1 -0.22426789
# 2: 1 A 2018-02-28 0.13333636 2 -0.09093152
# 3: 1 A 2018-03-07 1.08576936 1 1.08576936
# 4: 1 A 2018-03-09 -1.14765701 2 -0.06188765
# 5: 1 A 2018-03-17 -1.28459935 1 -1.28459935
# 6: 1 A 2018-03-26 0.25222345 1 0.25222345
# 7: 1 A 2018-03-28 -0.29921512 2 -0.04699167
# 8: 1 A 2018-03-31 0.76359346 2 0.46437834
# 9: 1 A 2018-04-01 0.43568330 3 0.90006164
# 10: 1 A 2018-04-03 -0.05710677 3 1.14216999
# 11: 1 B 2018-02-24 1.75790309 1 1.75790309
# 12: 1 B 2018-02-28 -0.45278397 2 1.30511912
# 13: 1 B 2018-03-07 -1.56378205 1 -1.56378205
# 14: 1 B 2018-03-09 -0.43331032 2 -1.99709237
# 15: 1 B 2018-03-17 0.83204713 1 0.83204713
# 16: 1 B 2018-03-26 0.99216037 1 0.99216037
# 17: 1 B 2018-03-28 0.72675075 2 1.71891111
# 18: 1 B 2018-03-31 -0.23570656 2 0.49104419
# 19: 1 B 2018-04-01 1.23830410 3 1.72934829
# 20: 1 B 2018-04-03 -1.16657055 3 -0.16397300
# 21: 2 A 2018-03-01 0.37739565 1 0.37739565
# 22: 2 A 2018-03-02 -0.41151083 2 -0.03411519
# 23: 2 A 2018-03-03 0.04672617 3 0.01261099
# 24: 2 A 2018-03-05 -0.79900925 4 -0.78639826
# 25: 2 A 2018-03-12 -0.69095384 1 -0.69095384
# 26: 2 A 2018-03-14 0.80418951 2 0.11323567
# 27: 2 A 2018-03-15 -1.23753842 3 -1.12430275
# 28: 2 A 2018-03-16 -0.28946157 4 -1.41376433
# 29: 2 A 2018-03-27 -0.89192113 1 -0.89192113
# 30: 2 A 2018-03-29 0.50360797 2 -0.38831316
# 31: 2 B 2018-03-01 0.56074609 1 0.56074609
# 32: 2 B 2018-03-02 1.15191175 2 1.71265784
# 33: 2 B 2018-03-03 -0.22732869 3 1.48532915
# 34: 2 B 2018-03-05 -0.54288826 4 0.94244090
# 35: 2 B 2018-03-12 1.15653700 1 1.15653700
# 36: 2 B 2018-03-14 -0.83204330 2 0.32449370
# 37: 2 B 2018-03-15 -0.27934628 3 0.04514742
# 38: 2 B 2018-03-16 -0.64947165 4 -0.60432423
# 39: 2 B 2018-03-27 -0.42951311 1 -0.42951311
# 40: 2 B 2018-03-29 -1.06559058 2 -1.49510369
# CashInd ACCOUNT DateTime Amt COUNT CumuAmt
另见Cumulative sum in a window (or running window sum) based on a condition in R
中的解决方案2