这就是我的数据框:
库(data.table)
df <- fread('
Name EventType Date SalesAmount RunningTotal Runningtotal(prior365Days)
John Email 1/1/2014 0 0 0
John Sale 2/1/2014 10 10 10
John Sale 7/1/2014 20 30 30
John Sale 4/1/2015 30 60 50
John Webinar 5/1/2015 0 60 50
Tom Email 1/1/2014 0 0 0
Tom Sale 2/1/2014 15 15 15
Tom Sale 7/1/2014 10 25 25
Tom Sale 4/1/2015 25 50 35
Tom Webinar 5/1/2015 0 50 35
')
df[,Date:= as.Date(Date, format="%m/%d/%Y")]
最后一列是我想要的列,它是过去365天滚动窗口中SalesAmount(对于每个Name)的累积总和,我在@ 6pool的帮助下执行了此操作。他的解决方案是:
df$EventDate <- as.Date(df$EventDate, format="%d/%m/%Y")
df <- df %>%
group_by (Name) %>%
arrange(EventDate) %>%
mutate(day = EventDate - EventDate[1])
f <- Vectorize(function(i)
sum(df[df$Name[i] == df$Name & df$day[i] - df$day >= 0 &
df$day[i] - df$day <= 365, "SalesAmount"]), vec="i")
df$RunningTotal365 <- f(1:nrow(df))
然而,由于我的数据帧超过150万行,因此df $ RunningTotal365&lt; -f(1:nrow(df))需要很长时间(超过1.5天)。我在最初的问题中被建议“rollapply”但我在这个例子中一直在努力弄清楚如何使用它。请帮助。
答案 0 :(得分:4)
尝试一下:
DF <- read.table(text = "Name EventType EventDate SalesAmount RunningTotal Runningtotal(prior365Days)
John Email 1/1/2014 0 0 0
John Sale 2/1/2014 10 10 10
John Sale 7/1/2014 20 30 30
John Sale 4/1/2015 30 60 50
John Webinar 5/1/2015 0 60 50
Tom Email 1/1/2014 0 0 0
Tom Sale 2/1/2014 15 15 15
Tom Sale 7/1/2014 10 25 25
Tom Sale 4/1/2015 25 50 35
Tom Webinar 5/1/2015 0 50 35", header = TRUE)
fun <- function(x, date, thresh) {
D <- as.matrix(dist(date)) #distance matrix between dates
D <- D <= thresh
D[lower.tri(D)] <- FALSE #don't sum to future
R <- D * x #FALSE is treated as 0
colSums(R)
}
library(data.table)
setDT(DF)
DF[, EventDate := as.Date(EventDate, format = "%m/%d/%Y")]
setkey(DF, Name, EventDate)
DF[, RT365 := fun(SalesAmount, EventDate, 365), by = Name]
# Name EventType EventDate SalesAmount RunningTotal Runningtotal.prior365Days. RT365
# 1: John Email 2014-01-01 0 0 0 0
# 2: John Sale 2014-02-01 10 10 10 10
# 3: John Sale 2014-07-01 20 30 30 30
# 4: John Sale 2015-04-01 30 60 50 50
# 5: John Webinar 2015-05-01 0 60 50 50
# 6: Tom Email 2014-01-01 0 0 0 0
# 7: Tom Sale 2014-02-01 15 15 15 15
# 8: Tom Sale 2014-07-01 10 25 25 25
# 9: Tom Sale 2015-04-01 25 50 35 35
#10: Tom Webinar 2015-05-01 0 50 35 35
答案 1 :(得分:4)
以下是使用foverlaps
包中的data.table
函数的方法:
require(data.table)
setDT(df)[, end := as.Date(EventDate, format="%d/%m/%Y")
][, start := end - 365L]
setkey(df, Name, start, end)
olaps = foverlaps(df, df, nomatch=0L, which=TRUE)
olaps = olaps[xid >= yid, .(ans = sum(dt$SalesAmount[yid])), by=xid]
df[olaps$xid, Runningtotal := olaps$ans]
如有必要,您可以执行以下操作删除start
和end
列:
df[, c("start", "end") := NULL]
很高兴知道它有多快/慢..
答案 2 :(得分:0)
在data.table中使用更新的非equi连接功能:
df1 = df[.(iName=Name,start = Date - 365L, end = Date),
on=.(Name=iName,Date >= start, Date <= end),nomatch = 0, allow.cart=TRUE][,
.(MyTotal = sum(SalesAmount)), by=.(Name,Date = Date.1)]
df[df1, on = .(Name,Date)]