Question

我很好奇是否有人可以提出一种（更快）的方法来计算可变时间间隔（窗口）的滚动统计（滚动均值，中位数，百分位数等）。

也就是说，假设有一个随机定时观察（即不是每日，或每周数据，观察只有一个时间戳，如在滴答数据中），并假设你想看看你的中心和离散度统计能够扩大和收紧计算这些统计数据的时间间隔。

我做了一个简单的for循环来做到这一点。但它显然运行得非常慢（实际上我认为我的循环仍在运行在我设置的一小部分数据样本上以测试其速度）。我一直试图让ddply这样做 - 这对于每日统计数据来说似乎是不可能的 - 但我似乎无法摆脱它。

示例：

样本设置：

df <- data.frame(Date = runif(1000,0,30))
df$Price <- I((df$Date)^0.5 * (rnorm(1000,30,4)))
df$Date <- as.Date(df$Date, origin = "1970-01-01")

示例函数（在许多观察中运行速度非常慢

SummaryStats <- function(dataframe, interval){
  # Returns daily simple summary stats, 
  # at varying intervals
  # dataframe is the data frame in question, with Date and Price obs
  # interval is the width of time to be treated as a day

  firstDay <- min(dataframe$Date)
  lastDay  <- max(dataframe$Date)
  result <- data.frame(Date = NULL,
                       Average = NULL,  Median = NULL,
                       Count = NULL,
                       Percentile25 = NULL, Percentile75 = NULL)

  for (Day in firstDay:lastDay){

    dataframe.sub = subset(dataframe,
                Date > (Day - (interval/2))
                & Date < (Day + (interval/2)))

    nu = data.frame(Date = Day, 
                    Average = mean(dataframe.sub$Price),
                    Median = median(dataframe.sub$Price),
                    Count = length(dataframe.sub$Price),
                    P25 = quantile(dataframe.sub$Price, 0.25),
                    P75 = quantile(dataframe.sub$Price, 0.75))

    result = rbind(result,nu)

  }

  return(result)

}

欢迎您的建议！

Answer 1

如果速度是您的主要关注点，那么

Rcpp是一个很好的方法。我将使用滚动均值统计来举例说明。

基准：Rcpp与R

x = sort(runif(25000,0,4*pi))
y = sin(x) + rnorm(length(x),0.5,0.5)
system.time( rollmean_r(x,y,xout=x,width=1.1) )   # ~60 seconds
system.time( rollmean_cpp(x,y,xout=x,width=1.1) ) # ~0.0007 seconds

Rcpp和R函数的代码

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {
    double total=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);

    for( i=0; i<nout; i++ ) {
      while( x[ redge ] - xout[i] <= width && redge<n ) 
        total += y[redge++];
      while( xout[i] - x[ ledge ] > width && ledge<n ) 
        total -= y[ledge++];
      if( ledge==redge ) { out[i]=NAN; total=0; continue; }
      out[i] = total / (redge-ledge);
    }
    return out;
  }')

rollmean_r = function(x,y,xout,width) {
  out = numeric(length(xout))
  for( i in seq_along(xout) ) {
    window = x >= (xout[i]-width) & x <= (xout[i]+width)
    out[i] = .Internal(mean( y[window] ))
  }
  return(out)
}

现在解释rollmean_cpp。 x和y是数据。 xout是请求滚动统计的点向量。 width是滚动窗口的宽度* 2。请注意，滑动窗口末端的indeces存储在ledge和redge中。这些基本上是指向x和y中各个元素的指针。这些indeces可能非常有利于调用其他C ++函数（例如，中位数等），它们将向量和起始和结束的indeces作为输入。

对于那些需要“{verbose”版本的rollmean_cpp进行调试（冗长）的人：

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {

    double total=0, oldtotal=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);


    for( i=0; i<nout; i++ ) {
      Rcout << "Finding window "<< i << " for x=" << xout[i] << "..." << std::endl;
      total = 0;

      // numbers to push into window
      while( x[ redge ] - xout[i] <= width && redge<n ) {
        Rcout << "Adding (x,y) = (" << x[redge] << "," << y[redge] << ")" ;
        Rcout << "; edges=[" << ledge << "," << redge << "]" << std::endl;
        total += y[redge++];
      }

      // numbers to pop off window
      while( xout[i] - x[ ledge ] > width && ledge<n ) {
        Rcout << "Removing (x,y) = (" << x[ledge] << "," << y[ledge] << ")";
        Rcout << "; edges=[" << ledge+1 << "," << redge-1 << "]" << std::endl;
        total -= y[ledge++];
      }
      if(ledge==n) Rcout << " OVER ";
      if( ledge==redge ) {
       Rcout<<" NO DATA IN INTERVAL " << std::endl << std::endl;
       oldtotal=total=0; out[i]=NAN; continue;}

      Rcout << "For interval [" << xout[i]-width << "," <<
               xout[i]+width << "], all points in interval [" << x[ledge] <<
               ", " << x[redge-1] << "]" << std::endl ;
      Rcout << std::endl;

      out[i] = ( oldtotal + total ) / (redge-ledge);
      oldtotal=total+oldtotal;
    }
    return out;
  }')

x = c(1,2,3,6,90,91)
y = c(9,8,7,5.2,2,1)
xout = c(1,2,2,3,6,6.1,13,90,100)
a = rollmean_cpp(x,y,xout=xout,2)
# Finding window 0 for x=1...
# Adding (x,y) = (1,9); edges=[0,0]
# Adding (x,y) = (2,8); edges=[0,1]
# Adding (x,y) = (3,7); edges=[0,2]
# For interval [-1,3], all points in interval [1, 3]
# 
# Finding window 1 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 2 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 3 for x=3...
# For interval [1,5], all points in interval [1, 3]
# 
# Finding window 4 for x=6...
# Adding (x,y) = (6,5.2); edges=[0,3]
# Removing (x,y) = (1,9); edges=[1,3]
# Removing (x,y) = (2,8); edges=[2,3]
# Removing (x,y) = (3,7); edges=[3,3]
# For interval [4,8], all points in interval [6, 6]
# 
# Finding window 5 for x=6.1...
# For interval [4.1,8.1], all points in interval [6, 6]
# 
# Finding window 6 for x=13...
# Removing (x,y) = (6,5.2); edges=[4,3]
# NO DATA IN INTERVAL 
# 
# Finding window 7 for x=90...
# Adding (x,y) = (90,2); edges=[4,4]
# Adding (x,y) = (91,1); edges=[4,5]
# For interval [88,92], all points in interval [90, 91]
# 
# Finding window 8 for x=100...
# Removing (x,y) = (90,2); edges=[5,5]
# Removing (x,y) = (91,1); edges=[6,5]
# OVER  NO DATA IN INTERVAL 

print(a)
# [1] 8.0 8.0 8.0 8.0 5.2 5.2 NaN 1.5 NaN

Answer 2

让我们看看......你正在做一个循环（在R中非常慢），在创建子集时创建不必要的数据副本，并使用rbind来累积数据集。如果你避免这些，事情将会大大加快。试试这个......

Summary_Stats <- function(Day, dataframe, interval){
    c1 <- dataframe$Date > Day - interval/2 & 
        dataframe$Date < Day + interval/2
    c(
        as.numeric(Day),
        mean(dataframe$Price[c1]),
        median(dataframe$Price[c1]),
        sum(c1),
        quantile(dataframe$Price[c1], 0.25),
        quantile(dataframe$Price[c1], 0.75)
      )
}
Summary_Stats(df$Date[2],dataframe=df, interval=20)
firstDay <- min(df$Date)
lastDay  <- max(df$Date)
system.time({
    x <- sapply(firstDay:lastDay, Summary_Stats, dataframe=df, interval=20)
    x <- as.data.frame(t(x))
    names(x) <- c("Date","Average","Median","Count","P25","P75")
    x$Date <- as.Date(x$Date)
})
dim(x)
head(x)

Answer 3

在上面回答我对“凯文”的问题时，我想我在下面找到了一些东西。

此函数获取刻度数据（时间观察以随机间隔进入并由时间戳指示）并计算一个间隔的平均值。

library(Rcpp)

cppFunction('
  NumericVector rollmean_c2( NumericVector x, NumericVector y, double width,
                              double Min, double Max) {

double total = 0, redge,center;
unsigned int n = (Max - Min) + 1,
                  i, j=0, k, ledge=0, redgeIndex;
NumericVector out(n);


for (i = 0; i < n; i++){
  center = Min + i + 0.5;
  redge = center - width / 2;
  redgeIndex = 0;
  total = 0;

  while (x[redgeIndex] < redge){
    redgeIndex++;
  }
  j = redgeIndex;

  while (x[j] < redge + width){
    total += y[j++];

  }

  out[i] = total / (j - redgeIndex);
}
return out;

  }')

# Set up example data
x = seq(0,4*pi,length.out=2500)
y = sin(x) + rnorm(length(x),0.5,0.5)
plot(x,y,pch=20,col="black",
     main="Sliding window mean; width=1",
     sub="rollmean_c in red      rollmean_r overlaid in white.")


c.out = rollmean_c2(x,y,width=1,Min = min(x), Max = max(x)) 
lines(0.5:12.5,c.out,col="red",lwd=3)

enter image description here

Answer 4

将所有连接点视为链条。将此链视为图形，其中每个数据点都是一个节点。然后，对于每个节点，我们希望找到距离w或更远的所有其他节点。为此，我首先生成一个给出成对距离的矩阵。第n行给出了节点n节点分开的距离。

# First, some data
x = sort(runif(25000,0,4*pi))
y = sin(x) + rnorm(length(x),0,0.5)

# calculate the rows of the matrix one by one
# until the distance between the two closest nodes is greater than w
# This algorithm is actually faster than `dist` because it usually stops
# much sooner
dl = list()
dl[[1]] = diff(x)
i = 1
while( min(dl[[i]]) <= w ) {
  pdl = dl[[i]]
  dl[[i+1]] = pdl[-length(pdl)] + dl[[1]][-(1:i)]
  i = i+1
}

# turn the list of the rows into matrices
rarray = do.call( rbind, lapply(dl,inf.pad,length(x)) )
larray = do.call( rbind, lapply(dl,inf.pad,length(x),"right") )

# extra function
inf.pad = function(x,size,side="left") {
  if(side=="left") {
    x = c( x, rep(Inf, size-length(x) ) )
  } else {
    x = c( rep(Inf, size-length(x) ), x )
  }
  x
}

然后我使用矩阵来确定每个窗口的边缘。在本例中，我设置了w=2。

# How many data points to look left or right at each data point
lookr = colSums( rarray <= w )
lookl = colSums( larray <= w )

# convert these "look" variables to indeces of the input vector
ri = 1:length(x) + lookr
li = 1:length(x) - lookl

通过定义窗口，使用*apply函数获得最终答案非常简单。

rolling.mean = vapply( mapply(':',li,ri), function(i) .Internal(mean(y[i])), 1 )

以上所有代码在我的计算机上花了大约50秒。这比我的其他答案中的rollmean_r函数快一点。但是，这里特别好的是提供了indeces。然后，您可以使用*apply函数使用您喜欢的任何R函数。例如，

rolling.mean = vapply( mapply(':',li,ri), 
                                        function(i) .Internal(mean(y[i])), 1 )

大约需要5秒钟。和，

rolling.median = vapply( mapply(':',li,ri), 
                                        function(i) median(y[i]), 1 )

大约需要14秒。如果你愿意，你可以在我的另一个答案中使用Rcpp函数来获取它们。

R - 在变量区间内计算滚动统计量的快速方法

4 个答案: