Question

下面是一段代码。它给出了滚动15分钟（历史）窗口的交易价格水平的百分位数。如果长度为500或1000，它会快速运行，但正如您所看到的，有45K观测值，而对于整个数据，它非常慢。我可以应用任何plyr功能吗？欢迎任何其他建议。

这是贸易数据的样子：

> str(trade)
'data.frame':   45571 obs. of  5 variables:
 $ time    : chr  "2013-10-20 22:00:00.489" "2013-10-20 22:00:00.807" "2013-10-20 22:00:00.811" "2013-10-20 22:00:00.811" ...
 $ prc     : num  121 121 121 121 121 ...
 $ siz     : int  1 4 1 2 3 3 2 2 3 4 ...
 $ aggress : chr  "B" "B" "B" "B" ...
 $ time.pos: POSIXlt, format: "2013-10-20 22:00:00.489" "2013-10-20 22:00:00.807" "2013-10-20 22:00:00.811" "2013-10-20 22:00:00.811" ...

这就是新列交易$ time.pos

之后数据的样子

trade$time.pos <- strptime(trade$time, format="%Y-%m-%d %H:%M:%OS") 

> head(trade)
                     time      prc siz aggress                time.pos
1 2013-10-20 22:00:00.489 121.3672   1       B 2013-10-20 22:00:00.489
2 2013-10-20 22:00:00.807 121.3750   4       B 2013-10-20 22:00:00.807
3 2013-10-20 22:00:00.811 121.3750   1       B 2013-10-20 22:00:00.811
4 2013-10-20 22:00:00.811 121.3750   2       B 2013-10-20 22:00:00.811
5 2013-10-20 22:00:00.811 121.3750   3       B 2013-10-20 22:00:00.811
6 2013-10-20 22:00:00.811 121.3750   3       B 2013-10-20 22:00:00.811

#t_15_index function returns the indices of the trades that were executed in last 15 minutes from the current trade(t-15 to t).
t_15_index <- function(data_vector,index) {
  which(data_vector[index] - data_vector[1:index]<=15*60)
}

get_percentile <- function(data) {
  len_d <- dim(trade)[1]  

  price_percentile = vector(length=len_d)  

  for(i in 1: len_d) {   

    t_15 = t_15_index(trade$time.pos,i)
    #ecdf(rep(..)) gets the empirical distribution of the the trade size on a particular trade-price level
    price_dist = ecdf(rep(trade$prc[t_15],trade$siz[t_15]))
    #percentile of the current price level depending on current (t-15 to t) subset of data
    price_percentile[i] = price_dist(trade$prc[i])
  }
  trade$price_percentile = price_percentile
  trade
}


res_trade = get_percentile(trade)

Answer 1

可能有一种方法可以加速滚动应用程序，但是由于窗口大小的变化，我认为标准工具（例如rollapply）不起作用，但也许有些人更熟悉它们会有想法。在此期间，您可以优化您的百分位数计算。您可以直接计算一个合适的近似值，而不是使用ecdf来创建具有所有相关开销的函数：

> vec <- rnorm(10000, 0, 3)
> val <- 5
> max(which(sort(vec) < val)) / length(vec)
[1] 0.9543
> ecdf(vec)(val)
[1] 0.9543
> microbenchmark(max(which(sort(vec) < val)) / length(vec))
Unit: milliseconds
expr      min       lq   median       uq      max neval
max(which(sort(vec) < val))/length(vec) 1.093434 1.105231 1.116364 1.141204 1.449141   100
> microbenchmark(ecdf(vec)(val))
Unit: milliseconds
expr      min       lq   median       uq      max neval
ecdf(vec)(val) 2.552946 2.808041 3.043579 3.439269 4.208202   100

大约2.5倍的改善。对于较小的样品，改进更大。

Answer 2

好吧，这个问题让我感兴趣。这些是我做的事情：

用自定义百分位计算替换ecdf
将time.pos更改为数字（因为它始终以秒为单位），因为[.POSIXct与[
将t_15_index更改为仅回溯到上一个最早的时间戳，因为数据已排序，因此我们无需一直回顾索引1。

这就是结果：

> system.time(res2 <- get_percentile2(trade))
  user  system elapsed 
14.458   0.768  15.215 
> system.time(res1 <- get_percentile(trade))
   user  system elapsed 
110.851  17.974 128.736

证明输出是相同的：

tail(sort(abs(res1$price_percentile - res2$price_percentile)), 50)
#  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# [45] 0 0 0 0 0 0

大约8.5倍的改善。请注意，如果每15分钟间隔的项目较少，则此改进会更大。这是在24小时内填补45K点数。因此，如果您的45K实际上超过几天，您可以期待更多的改进。这是代码：

t_15_index2 <- function(data_vector,index, min.index) {
  which(data_vector[index] - data_vector[min.index:index]<=minutes*60) + min.index - 1L
}
get_percentile2 <- function(trade) {
  len_d <- dim(trade)[1]  
  price_percentile = vector(length=len_d)
  min.index <- 1  
  for(i in 1: len_d) {
    t_15 = t_15_index2(trade$time.pos.2,i, min.index)
    vec <- rep(trade$prc[t_15],trade$siz[t_15])
    price_percentile[i] <- max(0, which(sort(vec) <= trade$prc[i])) / length(vec)
    min.index <- t_15[[1]]
  }
  trade$price_percentile = price_percentile
  trade
}

这是数据

start <- as.numeric(as.POSIXct("2013-01-01"))
end <- as.numeric(as.POSIXct("2013-01-02"))
set.seed(1)
minutes <- 15
ticks <- 45000
times <- sort(as.POSIXct(runif(ticks, start, end), origin=as.POSIXct("1970-01-01")))
trade <- data.frame(
  time=as.character(times),
  prc=100 + rnorm(ticks, 0, 5),
  siz=sample(1:10, ticks, rep=T),
  time.pos=times,
  time.pos.2=as.numeric(times)
)

最后，原来的功能略有修改，但基本相同：

t_15_index <- function(data_vector,index) {
  which(data_vector[index] - data_vector[1:index]<=minutes*60)
}
get_percentile <- function(trade) {
  len_d <- dim(trade)[1]    
  price_percentile = vector(length=len_d)  
  for(i in 1: len_d) {       
    t_15 = t_15_index(trade$time.pos,i)
    price_dist = ecdf(rep(trade$prc[t_15],trade$siz[t_15]))
    price_percentile[i] = price_dist(trade$prc[i])
  }
  trade$price_percentile = price_percentile
  trade
}

此外，ddply肯定无济于事。我也不认为data.table会有太多帮助，因为大型矢量的主要查找是用整数索引完成的。它可能会有所帮助。我可以稍后再检查一下。

# Version that pulls whole 2000 entries each time
sub.vec <- numeric(2000)
system.time(r1 <- for(i in seq_len(length(vec) - 2000)) sub.vec <- vec[i:(i+1999)])
#  user  system elapsed 
# 17.507   4.723  22.211 

# Version that overwrites 1 value at a time keeping the most recent 2K
sub.vec <- numeric(2001) # need to make this slightly larger because of 2000 %% 2000 == 0
system.time(r2 <- for(i in seq_len(length(vec) - 2000)) sub.vec[[(i %% 2000) + 1]] <- vec[[i]])

#  user  system elapsed 
# 2.642   0.009   2.650 

all.equal(r1, tail(r2, -1L))
# [1] TRUE

最后，如果你这样做，你也可以提出聪明的方法来快速重新计算百分位数，因为你知道你的15分钟包含的内容，添加的内容以及删除的内容。

不是100％确定执行FIFO 15分钟窗口所需的簿记是否会最终克服这样做的好处。

Answer 3

这是一个快速的方法，可以更有效地找到十五分钟前发生的时间索引：

# Create some sample data (modified from BrodieG)
set.seed(1)

ticks <- 45000
start <- as.numeric(as.POSIXct("2013-01-01"))
end <- as.numeric(as.POSIXct("2013-01-02"))
times <- as.POSIXct(runif(ticks, start, end), origin=as.POSIXct("1970-01-01"))
trade <- data.frame(
  time = sort(times),
  prc = 100 + rnorm(ticks, 0, 5),
  siz = sample(1:10, ticks, rep = T)
)

# For vector of times, find the index of the first time that was at least
# fifteen minutes before the current time. Assumes times are sorted
minutes_ago <- function(time, minutes = 15) {
  secs <- minutes * 60
  time <- as.numeric(time)
  out <- integer(length(time))

  before <- 1

  for(i in seq_along(out)) {
    while(time[before] < time[i] - secs) {
      before <- before + 1
    }
    out[i] <- before

  }
  out
}
system.time(minutes_ago(trade$time))
# Takes about 0.2s on my machine

library(Rcpp)
cppFunction("IntegerVector minutes_ago2(NumericVector time, int minutes = 15) {
  int secs = minutes * 60;
  int n = time.size();
  IntegerVector out(n);

  int before = 0;
  for (int i = 0; i < n; ++i) {
    # Could do even better here with a binary search
    while(time[before] < time[i] - secs) {
      before++;
    }
    out[i] = before + 1;
  }
  return out;
}")

system.time(minutes_ago2(trade$time, 10))
# Takes less than < 0.001

all.equal(minutes_ago(trade$time, 15), minutes_ago2(trade$time, 15))

Answer 4

如果要在dplyr中使用ecdf，请在mutate中使用seq_along / sapply以获得更快的结果

y <- x %>% mutate(percentile.score = sapply(seq_along(score), function(i){sum(score[1:i] <= score[i])/i}))

需要更快的滚动应用功能，启动停止指数

4 个答案: