累积回顾滚动连接

时间:2017-05-18 02:26:07

标签: r data.table xts

我有两个数据表。我想做一个滚动连接但是“累积式”。 例如,这里有两个表

d1 <-  data.table(starttime = c("2011-01-01 15:29:50", "2011-01-01 15:30:03", "2011-01-01 15:40:20", "2011-01-01 15:50:20" ,"2011-01-01 16:30:00", "2011-01-01 16:40:00"),
              endtime = c("2011-01-01 15:30:00", "2011-01-01 15:30:15", "2011-01-01 15:40:28", "2011-01-01 15:50:25", "2011-01-01 16:31:00", "2011-01-01 16:41:00"), v = c("A", "B", "B", "A", "B", "A"), m = c(2,3,5,8,9,9), dur = c(10,12,8,5,60,11))

starttime               endtime           v   m    dur
2011-01-01 15:29:50  2011-01-01 15:30:00  A   2    10
2011-01-01 15:30:03  2011-01-01 15:30:15  B   3    12
2011-01-01 15:40:20  2011-01-01 15:40:28  B   5     8
2011-01-01 15:50:20  2011-01-01 15:50:25  A   8     5
2011-01-01 16:30:00  2011-01-01 16:31:00  B   9    60
2011-01-01 16:40:00  2011-01-01 16:41:00  A   9    11

d2 <-  data.table(time = c("2011-01-01 16:39:50", "2011-01-01 16:00:03", "2011-01-01 16:50:50"),
                            v = c("A", "B", "A"),  mk = rnorm(3))

               time   v          mk
2011-01-01 16:00:03   B   -0.2385093
2011-01-01 16:39:50   A   -0.4966836
2011-01-01 16:50:50   A   -0.4566836

现在对于d2中的第一行,考虑从第一行的d2 $时间回顾,我想得到在d1行中给定相同d2 $ v的m的总和,直到持续时间的总和(endtime-starttime)&gt; 15

还有一种方法可以计算我用来添加多少行&gt; 15秒?

基本上是这样的 因为我应该与此类似

               time   v       mk       m     rowsUsed 
2011-01-01 16:00:03   B   -0.2385093   8            2
2011-01-01 16:39:50   A   -0.4966836   10           2
2011-01-01 16:50:50   A   -0.4566836   17           2

有谁可以帮助我如何构建这样的滚动连接?我有很多行,所以速度是一个问题。愿意灵活使用XTS。

2 个答案:

答案 0 :(得分:3)

这是我的版本。你可以随意修改它。如果您发现它有用,请告诉我。

library("lubridate")
library("data.table")

d1 <-  data.table(starttime = parse_date_time(c("2011-01-01 15:29:50", "2011-01-01 15:30:03", "2011-01-01 15:40:20", "2011-01-01 15:50:20" ,"2011-01-01 16:30:00", "2011-01-01 16:40:00"), orders="ymd HMS"),
                  endtime = parse_date_time(c("2011-01-01 15:30:00", "2011-01-01 15:30:15", "2011-01-01 15:40:28", "2011-01-01 15:50:25", "2011-01-01 16:31:00", "2011-01-01 16:41:00"), orders="ymd HMS"), v = c("A", "B", "B", "A", "B", "A"), m = c(2,3,5,8,9,9), dur = c(10,12,8,5,60,11))

d2 <-  data.table(time = parse_date_time(c("2011-01-01 16:39:50", "2011-01-01 16:00:03", "2011-01-01 16:50:50"), orders="ymd HMS"),
                  v = c("A", "B", "A"),  mk = rnorm(3))

get_m_rows <- function(value,timeValue,threshold){
  d3 <- d1[v==value]
  d3 <- d3[order(endtime,decreasing = TRUE)]
  d3[endtime<timeValue,totalTime:=cumsum(dur)]
  eligibleRows <- d3[endtime<timeValue,.N]
  ifelse(d3[totalTime<=threshold&!is.na(totalTime),.N]>0,rowIndex <- d3[,.I[totalTime<=threshold&!is.na(totalTime)]],rowIndex <- 0)
  ifelse(rowIndex==0,rowIndex<-1,ifelse(length(rowIndex)<eligibleRows,rowIndex<-c(rowIndex,rowIndex[length(rowIndex)]+1),0)) 
  return(d3[rowIndex,.(m=sum(m),.N)])
}

d2[,c("m","rowUsed"):=(get_m_rows(v,time,15)),by=.(v,time)]

# time v         mk  m rowUsed
# 1: 2011-01-01 16:39:50 A -0.2025446 10       2
# 2: 2011-01-01 16:00:03 B  1.2363660  8       2
# 3: 2011-01-01 16:50:50 A  1.0222815 17       2

答案 1 :(得分:2)

试试这个,我解释一下这些评论,告诉我它是不清楚还是太慢。

library(data.table)
library(pbapply)

d1 <-  data.table(starttime = c("2011-01-01 15:29:50", "2011-01-01 15:30:03", "2011-01-01 15:40:20", "2011-01-01 15:50:20" ,"2011-01-01 16:30:00", "2011-01-01 16:40:00"),
                  endtime = c("2011-01-01 15:30:00", "2011-01-01 15:30:15", "2011-01-01 15:40:28", "2011-01-01 15:50:25", "2011-01-01 16:31:00", "2011-01-01 16:41:00"), v = c("A", "B", "B", "A", "B", "A"), m = c(2,3,5,8,9,9), dur = c(10,12,8,5,60,11))

d2 <-  data.table(time = c("2011-01-01 16:39:50", "2011-01-01 16:00:03", "2011-01-01 16:50:50"),
                  v = c("A", "B", "A"),  mk = rnorm(3))

d1$endtime <- as.POSIXct(d1$endtime)
d2$time <- as.POSIXct(d2$time)
d1 <- d1[order(d1$endtime,decreasing=TRUE),] # I want the more recent on top

output_list <- pbapply(d2,1,function(row){
  sub_d1 <- subset(d1,endtime <= row["time"] & v == row["v"]) # keep only relevant rows timewise and with correct v
  sub_d1$cumdur <- cumsum(sub_d1$dur) # sum the time to be able to limit ti be able to test this 15 sec limit
  rowsUsed <- nrow(sub_d1) - nrow(subset(sub_d1,cumdur >= 15)) + 1 # check the number of rows I need
  m <- sum(sub_d1$m[1:rowsUsed]) # sum the relevant m
  return(list(m,rowsUsed)) # return as list
  })

d2 <- cbind(d2, matrix(unlist(output_list),ncol=2,byrow=TRUE,dimnames = list(NULL,c("m","rowsUsed"))))

# time v          mk  m rowsUsed
# 1: 2011-01-01 16:39:50 A -0.01884752 10        2
# 2: 2011-01-01 16:00:03 B  0.08545874  8        2
# 3: 2011-01-01 16:50:50 A  1.62738391 17        2