rollapply如何“忽略”某些观察并使用可变宽度

时间:2019-08-13 22:01:54

标签: r rollapply

我正在尝试计算沿非常规日期序列的某些数据的均值。例如,我有一天中特定时间段的分钟水平数据,并且我有兴趣计算5分钟平均值。但是,我不确定在指定为列表时rollapply中的width参数如何工作。

library(tidyverse)
library(zoo)

length = 16

set.seed(10)

dxf <- data.frame(
  date = seq(Sys.time(), by = "59 sec", length.out = length),
  value = runif(length)
)

# Create a "discontinuity"
dxf$date[8:length] <- dxf$date[8:length] + 3600*24

# Add some noise
dxf$date <- dxf$date + runif(length, 0, 1)

diff(dxf$date)

dxf %>%
  arrange(date) %>%
  mutate(
    diff = c(as.numeric(diff(date)), NA),
    mean = rollapply(value, width = 5, mean, partial = TRUE, align = "left")
  )

# This is what I need. Therefore, I need a variable width but adjusting to the discontinuity in the rows.
mean1 <- mean(dxf$value[1:5])
mean2 <- mean(dxf$value[2:6])
mean3 <- mean(dxf$value[3:7])
mean4 <- NA # Only have 4 values mean(dxf$value[4:7])
mean5 <- NA # Only have 3 values mean(dxf$value[5:7])
mean6 <- NA # Only have 2 values mean(dxf$value[6:7])
mean7 <- NA # Only have 1 values mean(dxf$value[7:7])
mean8 <- mean(dxf$value[7:11])
etc.

3 个答案:

答案 0 :(得分:1)

我认为这是一个棘手的问题。这是一种方法

1从第一个日期时间到最后一个日期时间生成一个1分钟的序列

2进行插值,因此每1分钟有一个值。这包括在不连续处进行插值

3根据1分钟的内插值计算连续5分钟的平均值

4删除原始日期时间值之间的距离过大的值

还请注意时区,最好将其设置为lubridate功能默认情况下故意选择的值或UTC。

library(tidyverse)
library(RcppRoll)
library(lubridate)



dxf <- tibble(
  date = seq(from = ymd_hms('2019-08-14 09:06:05'), by = "59 sec", length.out = 30),
  value = runif(30)
)

dxf$date[15:30] <- dxf$date[15:30] + 3600*24 # discontinuing
dxf$date <- dxf$date + round(runif(30)) # noise

dxf <- dxf %>% 
  mutate(date = ymd_hms(date),
         date_num = as.numeric(date),
         diff = date_num - lag(date_num))


discontinuity <- which(dxf$diff > 70)



n = nrow(dxf)

date_seq <- seq(from = dxf$date_num[1], to = dxf$date_num[n], by = 60) # create a 1 min sequence

value_interp = approx(x = dxf$date_num, y = dxf$value, xout = date_seq)  # interpolate values for the 5 min sequence

df <- tibble(
  date = as_datetime(date_seq),
  mean_value = RcppRoll::roll_mean(value_interp$y, n = 5, fill = NA, align = 'left'))

df %>% 
  filter(date < dxf$date[discontinuity - 1] | date > dxf$date[discontinuity])

答案 1 :(得分:0)

我们可以提取日期并将其分组,然后使用rollmean

library(dplyr)

dxf %>%
  mutate(d1 = as.Date(date)) %>%
  group_by(d1) %>%
  mutate(mean = zoo::rollmean(value, 5, align = "left", fill = NA)) %>%
  ungroup %>%
  select(-d1)


#   date                 value   mean
#   <dttm>               <dbl>  <dbl>
# 1 2019-08-14 12:49:09 0.507   0.404
# 2 2019-08-14 12:50:08 0.307   0.347
# 3 2019-08-14 12:51:07 0.427   0.341
# 4 2019-08-14 12:52:07 0.693     NA    
# 5 2019-08-14 12:53:06 0.0851    NA    
# 6 2019-08-14 12:54:05 0.225     NA    
# 7 2019-08-14 12:55:04 0.275     NA    
# 8 2019-08-15 12:56:02 0.272   0.507
# 9 2019-08-15 12:57:01 0.616   0.476
#10 2019-08-15 12:58:01 0.430   0.472
#11 2019-08-15 12:59:00 0.652   0.457
#12 2019-08-15 12:59:58 0.568   0.413
#13 2019-08-15 13:00:58 0.114     NA    
#14 2019-08-15 13:01:56 0.596     NA    
#15 2019-08-15 13:02:56 0.358     NA    
#16 2019-08-15 13:03:54 0.429     NA  

数据

set.seed(10)

dxf <- data.frame(
   date = seq(Sys.time(), by = "59 sec", length.out = length),
   value = runif(length)
)
dxf$date[8:length] <- dxf$date[8:length] + 3600*24
dxf$date <- dxf$date + runif(length, 0, 1)

答案 2 :(得分:0)

此处w[i]date的元素数,该元素数小于或等于date[i] + 300减去i - 1,请注意300表示300秒。

date <- dxf$date
w <- findInterval(date + 300, date) - seq_along(date) + 1

rollapply(dxf$value, w, mean, align = "left") * ifelse(w < 5, NA, 1)

# same
sapply(seq_along(w), function(i) mean(dxf$value[seq(i, length = w[i])])) * 
  ifelse(w < 5, NA, 1)