在过去365天窗口中执行运行总计的有效方法

时间:2015-06-02 15:17:43

标签: r vectorization dplyr zoo rollapply

这就是我的数据框:

库(data.table)

df <- fread('
                Name  EventType  Date  SalesAmount RunningTotal Runningtotal(prior365Days)
                John    Email      1/1/2014      0          0            0
                John    Sale       2/1/2014     10          10           10
                John    Sale       7/1/2014     20          30           30
                John    Sale       4/1/2015     30          60           50 
                John    Webinar    5/1/2015      0          60           50
                Tom     Email      1/1/2014      0          0            0
                Tom     Sale       2/1/2014     15          15           15
                Tom     Sale       7/1/2014     10          25           25
                Tom     Sale       4/1/2015     25          50           35 
                Tom     Webinar    5/1/2015      0          50           35
                ')
    df[,Date:= as.Date(Date, format="%m/%d/%Y")]

最后一列是我想要的列,它是过去365天滚动窗口中SalesAmount(对于每个Name)的累积总和,我在@ 6pool的帮助下执行了此操作。他的解决方案是:

df$EventDate <- as.Date(df$EventDate, format="%d/%m/%Y")
df <- df %>%
   group_by (Name) %>%
   arrange(EventDate) %>% 
   mutate(day = EventDate - EventDate[1])

f <- Vectorize(function(i)
    sum(df[df$Name[i] == df$Name & df$day[i] - df$day >= 0 & 
             df$day[i] - df$day <= 365, "SalesAmount"]), vec="i")
df$RunningTotal365 <- f(1:nrow(df))

然而,由于我的数据帧超过150万行,因此df $ RunningTotal365&lt; -f(1:nrow(df))需要很长时间(超过1.5天)。我在最初的问题中被建议“rollapply”但我在这个例子中一直在努力弄清楚如何使用它。请帮助。

3 个答案:

答案 0 :(得分:4)

尝试一下:

DF <- read.table(text = "Name  EventType  EventDate  SalesAmount RunningTotal Runningtotal(prior365Days)
John    Email      1/1/2014      0          0            0
John    Sale       2/1/2014     10          10           10
John    Sale       7/1/2014     20          30           30
John    Sale       4/1/2015     30          60           50 
John    Webinar    5/1/2015      0          60           50
Tom     Email      1/1/2014      0          0            0
Tom     Sale       2/1/2014     15          15           15
Tom     Sale       7/1/2014     10          25           25
Tom     Sale       4/1/2015     25          50           35 
Tom     Webinar    5/1/2015      0          50           35", header = TRUE)


fun <- function(x, date, thresh) {
  D <- as.matrix(dist(date)) #distance matrix between dates
  D <- D <= thresh
  D[lower.tri(D)] <- FALSE #don't sum to future
  R <- D * x #FALSE is treated as 0
  colSums(R)
}


library(data.table)
setDT(DF)
DF[, EventDate := as.Date(EventDate, format = "%m/%d/%Y")]
setkey(DF, Name, EventDate)

DF[, RT365 := fun(SalesAmount, EventDate, 365), by = Name]

#    Name EventType  EventDate SalesAmount RunningTotal Runningtotal.prior365Days. RT365
# 1: John     Email 2014-01-01           0            0                          0     0
# 2: John      Sale 2014-02-01          10           10                         10    10
# 3: John      Sale 2014-07-01          20           30                         30    30
# 4: John      Sale 2015-04-01          30           60                         50    50
# 5: John   Webinar 2015-05-01           0           60                         50    50
# 6:  Tom     Email 2014-01-01           0            0                          0     0
# 7:  Tom      Sale 2014-02-01          15           15                         15    15
# 8:  Tom      Sale 2014-07-01          10           25                         25    25
# 9:  Tom      Sale 2015-04-01          25           50                         35    35
#10:  Tom   Webinar 2015-05-01           0           50                         35    35

答案 1 :(得分:4)

以下是使用foverlaps包中的data.table函数的方法:

require(data.table)
setDT(df)[, end := as.Date(EventDate, format="%d/%m/%Y")
        ][, start := end - 365L]
setkey(df, Name, start, end)
olaps = foverlaps(df, df, nomatch=0L, which=TRUE)
olaps = olaps[xid >= yid, .(ans = sum(dt$SalesAmount[yid])), by=xid]

df[olaps$xid, Runningtotal := olaps$ans]

如有必要,您可以执行以下操作删除startend列:

df[, c("start", "end") := NULL]

很高兴知道它有多快/慢..

答案 2 :(得分:0)

在data.table中使用更新的非equi连接功能:

    df1 = df[.(iName=Name,start = Date - 365L, end = Date),
    on=.(Name=iName,Date >= start, Date <= end),nomatch = 0, allow.cart=TRUE][,
  .(MyTotal = sum(SalesAmount)), by=.(Name,Date = Date.1)]


    df[df1, on = .(Name,Date)]