有几个关于在R中进行滚动总和的询问,但没有一个完全解决了问题的复杂性。我很想知道如何在具有以下特征的数据集上找到最快的方法:
•至少5,000行
•数据的时间跨度至少为15年
•组数是总行数的很大一部分
•每个组的时间序列数据不完整
•任何一天的任何一组都可以有多个观察结果
滚动求和问题的完整解决方案必须能够稳健地处理上述每一个特征。我已经对3种不同的解决方案进行了基准测试。所有解决方案计算的都是7天滚动总和,并且每个解决方案的结果相同。据我所知,plyr::ddply函数是获取滚动总和的最快方法,因此我感兴趣的是如何进一步优化这一策略,以及任何能够更快完成此类任务的代码。我的R代码和基准测试结果如下:
library(data.table)
library(dplyr)
library(lubridate)
library(zoo)
library(plyr)
library(RcppRoll)
library(microbenchmark)
# Creating dataset
# 5,000 observations with ids 1..5000, a group (`type`) drawn from 1..250 and
# a day drawn from 2000-01-01..2017-01-01. Both are sampled with replacement,
# so a group can have several observations on one day and none on most days.
set.seed(8)  # fixed seed so the benchmark data is reproducible
df_250_groups <- data.table(
  id = seq.int(5000),
  date = sample(
    seq(as.Date("2000/01/01"), as.Date("2017/01/01"), by = "day"),
    5000,
    replace = TRUE
  ),
  type = sample(seq.int(250), 5000, replace = TRUE),
  n = 1
)
# Using plyr::ddply
#
# Per-group worker meant to be handed to plyr::ddply(): adds a column
# `rolling_7_day_sum` holding, for every row, the sum of `n` over all rows of
# `data` whose `date` is the same day or up to `window - 1` days earlier.
#
# Args:
#   data:   data frame (one group) with a `date` column and a numeric `n`
#           column.
#   window: length of the trailing window in days (default 7, the value that
#           used to be hard-coded). The output column keeps the name
#           `rolling_7_day_sum` for backward compatibility.
#
# Returns:
#   `data` with the extra `rolling_7_day_sum` column (stored as double); a
#   zero-row input is returned unchanged.
plyr_ddply <- function(data, window = 7L) {
  row_count <- nrow(data)
  if (row_count == 0L) {
    return(data)
  }
  # Pull the columns out once instead of re-subsetting the data frame on
  # every iteration.
  dates <- data$date
  amounts <- data$n
  # Day lags that fall inside the window: 0 (same day) .. window - 1.
  lags_in_window <- seq_len(window) - 1L
  data$rolling_7_day_sum <- vapply(
    seq_len(row_count),
    function(i) sum(amounts[as.numeric(dates[i] - dates) %in% lags_in_window]),
    numeric(1)
  )
  data
}
# Using RcppRoll::roll_sumr
#
# Rolling `window`-day sum of `n` per `type`, computed on a gap-free daily
# grid so that a fixed-width rolling window corresponds to calendar days.
#
# Args:
#   df_250_groups: table with columns `type`, `date` (Date) and `n`; any other
#                  columns are carried through the final merge.
#   window:        length of the trailing window in days (default 7). The
#                  result column is always named `rolling_7_day_sum`.
#
# Returns:
#   A data.table with one row per input row and two extra columns: `N`, the
#   total of `n` for that type and day, and `rolling_7_day_sum`, the sum of
#   `N` over that day and the `window - 1` preceding days.
RcppRoll_roll_sumr <- function(df_250_groups, window = 7L) {
  obs <- as.data.table(df_250_groups)
  if (nrow(obs) == 0L) {
    # Nothing to roll over: return an empty table that still carries the
    # result columns, laid out like the non-empty result.
    obs <- copy(obs)
    obs[, `:=`(N = 0, rolling_7_day_sum = NA_real_)]
    setcolorder(obs, c("type", "date"))
    return(obs)
  }
  # Gap-free calendar derived from the data itself (previously a hard-coded
  # 2000..2017 range). It starts `window - 1` days before the first
  # observation so every observed day has a complete window.
  calendar <- seq(
    min(obs$date, na.rm = TRUE) - (window - 1L),
    max(obs$date, na.rm = TRUE),
    by = "day"
  )
  # Use the type values that actually occur rather than assuming 1..k.
  groups <- unique(obs$type)
  # One row per (type, day), dates ascending within each type; days without
  # observations contribute 0.
  daily <- data.table(
    type = rep(groups, each = length(calendar)),
    date = rep(calendar, times = length(groups)),
    N = 0
  )
  # Sum `n` (not the row count) so the result agrees with plyr_ddply() even
  # when `n` is not 1.
  observed <- obs[, list(N = sum(n)), by = c("type", "date")]
  daily[observed, on = c("type", "date"), N := i.N]
  # `daily` is already in date order within each type, which is all the
  # grouped rolling sum needs. The first `window - 1` rows of each type are NA.
  daily[, rolling_7_day_sum := roll_sumr(N, window, fill = NA), by = type]
  merge(obs, daily, by = c("type", "date"))
}
# Using zoo::rollsumr
#
# Rolling `window`-day sum of `n` per `type`, computed on a gap-free daily
# grid so that a fixed-width rolling window corresponds to calendar days.
#
# Args:
#   df_250_groups: table with columns `type`, `date` (Date) and `n`; any other
#                  columns are carried through the final merge.
#   window:        length of the trailing window in days (default 7). The
#                  result column is always named `rolling_7_day_sum`.
#
# Returns:
#   A data.table with one row per input row and two extra columns: `N`, the
#   total of `n` for that type and day, and `rolling_7_day_sum`, the sum of
#   `N` over that day and the `window - 1` preceding days.
zoo_rollsumr <- function(df_250_groups, window = 7L) {
  obs <- as.data.table(df_250_groups)
  if (nrow(obs) == 0L) {
    # Nothing to roll over: return an empty table that still carries the
    # result columns, laid out like the non-empty result.
    obs <- copy(obs)
    obs[, `:=`(N = 0, rolling_7_day_sum = NA_real_)]
    setcolorder(obs, c("type", "date"))
    return(obs)
  }
  # Gap-free calendar derived from the data itself (previously a hard-coded
  # 2000..2017 range). It starts `window - 1` days before the first
  # observation so every observed day has a complete window.
  calendar <- seq(
    min(obs$date, na.rm = TRUE) - (window - 1L),
    max(obs$date, na.rm = TRUE),
    by = "day"
  )
  # Use the type values that actually occur rather than assuming 1..k.
  groups <- unique(obs$type)
  # One row per (type, day), dates ascending within each type; days without
  # observations contribute 0.
  daily <- data.table(
    type = rep(groups, each = length(calendar)),
    date = rep(calendar, times = length(groups)),
    N = 0
  )
  # Sum `n` (not the row count) so the result agrees with plyr_ddply() even
  # when `n` is not 1.
  observed <- obs[, list(N = sum(n)), by = c("type", "date")]
  daily[observed, on = c("type", "date"), N := i.N]
  # rollsumr() always sums and takes no function argument (that belongs to
  # rollapply()), so the stray positional `sum` of the earlier call is gone.
  # `daily` is already in date order within each type.
  daily[, rolling_7_day_sum := rollsumr(N, window, fill = NA), by = type]
  merge(obs, daily, by = c("type", "date"))
}
# Time the three implementations on the same data set: 25 evaluations each,
# with the summary printed in seconds.
mb_250_groups <- microbenchmark(
  # NOTE(review): `.id` does not appear to be a documented ddply() argument;
  # it is presumably forwarded through `...` -- confirm it is still needed.
  plyr_ddply_df = ddply(df_250_groups, .(type), plyr_ddply, .id = FALSE),
  RcppRoll_roll_sumr(df_250_groups),
  zoo_rollsumr(df_250_groups),
  times = 25L,
  unit = "s"
)
print(mb_250_groups)
Unit: seconds
expr min lq mean median uq max neval
plyr_ddply_df 1.258333 1.262470 1.279760 1.266951 1.282107 1.352914 25
RcppRoll_roll_sumr(df_250_groups) 1.628959 1.660497 1.714138 1.709180 1.756196 1.866871 25
zoo_rollsumr(df_250_groups) 2.155310 2.193149 2.264942 2.247413 2.269310 2.702394 25
# Executing functions to verify that these rolling functions work correctly
plyr_ddply_df <- ddply(df_250_groups, .(type), plyr_ddply, .id = FALSE)
RcppRoll_roll_sumr_df <- RcppRoll_roll_sumr(df_250_groups)
zoo_rollsumr_df <- zoo_rollsumr(df_250_groups)
# The results may be ordered differently, so align them on the unique `id`
# before comparing the rolling sums; stop if any implementation disagrees.
by_id <- function(result) result$rolling_7_day_sum[order(result$id)]
stopifnot(
  isTRUE(all.equal(by_id(plyr_ddply_df), by_id(RcppRoll_roll_sumr_df))),
  isTRUE(all.equal(by_id(RcppRoll_roll_sumr_df), by_id(zoo_rollsumr_df)))
)