有没有人开发出一种优雅,快捷的方式来按日期执行滚动总和?例如,如果我想通过Cust_ID为以下数据集创建一个180天的滚动总计,有没有办法更快地完成它(比如data.table中的某些内容)。我一直在使用以下示例来计算滚动总和,但我担心它效率低下。
library("zoo")
library("plyr")
library("lubridate")
##Make some sample variables
set.seed(1)
Trans_Dates <- as.Date(c(31,33,65,96,150,187,210,212,240,273,293,320,
32,34,66,97,151,188,211,213,241,274,294,321,
33,35,67,98,152,189,212,214,242,275,295,322),origin="2010-01-01")
Cust_ID <- c(rep(1,12),rep(2,12),rep(3,12))
Target <- rpois(36,3)
##Combine into one dataset
Example.Data <- data.frame(Trans_Dates,Cust_ID,Target)
##Create extra variable with 180 day rolling sum
Example.Data2 <- ddply(Example.Data, .(Cust_ID),
function(datc) adply(datc, 1,
function(x) data.frame(Target_Running_Total =
sum(subset(datc, Trans_Dates>(as.Date(x$Trans_Dates)-180) & Trans_Dates<=x$Trans_Dates)$Target))))
#Print new data
Example.Data2
答案 0 :(得分:2)
假设您的小组具有或多或少的平衡,那么我怀疑expand.grid
和ave
会非常快(您必须对数据进行基准测试才能确定) 。我使用expand.grid
来填补缺失的日期,以便我可以天真地使用cumsum
获取滚动金额,然后使用head
减去除最近180之外的所有内容。
- 作为一个问题(以及更熟练的R用户),为什么我的identical
电话总是失败? -
我建立了相同的数据。
full <- expand.grid(seq(from=min(Example.Data$Trans_Dates), to=max(Example.Data$Trans_Dates), by=1), unique(Example.Data$Cust_ID))
Example.Data3 <- merge(Example.Data, full, by.x=c("Trans_Dates", "Cust_ID"), by.y=c("Var1", "Var2"), all=TRUE)
Example.Data3 <- Example.Data3[with(Example.Data3, order(Cust_ID, Trans_Dates)), ]
Example.Data3$Target.New <- ifelse(is.na(Example.Data3$Target), 0, Example.Data3$Target)
Example.Data3$Target_Running_Total <- ave(Example.Data3$Target.New, Example.Data3$Cust_ID, FUN=function(x) cumsum(x) - c(rep(0, 180), head(cumsum(x), -180)))
Example.Data3$Target.New <- NULL
Example.Data3 <- Example.Data3[complete.cases(Example.Data3), ]
row.names(Example.Data3) <- seq(nrow(Example.Data3))
Example.Data3
identical(Example.Data2$Target_Running_Total, Example.Data3$Target_Running_Total)
sum(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)
(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)
产生以下结果。
> (Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
答案 1 :(得分:1)
我想我偶然发现了一个相当有效的答案。
set.seed(1)
Trans_Dates <- as.Date(c(31,33,65,96,150,187,210,212,240,273,293,320,
32,34,66,97,151,188,211,213,241,274,294,321,
33,35,67,98,152,189,212,214,242,275,295,322),origin="2010-01-01")
Cust_ID <- c(rep(1,12),rep(2,12),rep(3,12))
Target <- rpois(36,3)
##Make simulated data into a data.table
library(data.table)
data <- data.table(Cust_ID,Trans_Dates,Target)
##Assign each customer an number that ranks them
data[,Cust_No:=.GRP,by=c("Cust_ID")]
##Create "list" of comparison dates
Ref <- data[,list(Compare_Value=list(I(Target)),Compare_Date=list(I(Trans_Dates))), by=c("Cust_No")]
##Compare two lists and see of the compare date is within N days
data$Roll.Val <- mapply(FUN = function(RD, NUM) {
d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
sum((d <= 0 & d >= -180)*Ref$Compare_Value[[NUM]])
}, RD = data$Trans_Dates,NUM=data$Cust_No)
##Print out data
data <- data[,list(Cust_ID,Trans_Dates,Target,Roll.Val)][order(Cust_ID,Trans_Dates)]
data
答案 2 :(得分:1)
library(data.table)
set.seed(1)
data <- data.table(Cust_ID = c(rep(1, 12), rep(2, 12), rep(3, 12)),
Trans_Dates = as.Date(c(31, 33, 65, 96, 150, 187, 210,
212, 240, 273, 293, 320, 32, 34,
66, 97, 151, 188, 211, 213, 241,
274, 294, 321, 33, 35, 67, 98,
152, 189, 212, 214, 242, 275,
295, 322),
origin = "2010-01-01"),
Target = rpois(36, 3))
data[, RollingSum := {
d <- data$Trans_Dates - Trans_Dates
sum(data$Target[Cust_ID == data$Cust_ID & d <= 0 & d >= -180])
},
by = list(Trans_Dates, Cust_ID)]