我正在尝试计算沿非常规日期序列的某些数据的均值。例如,我有一天中特定时间段的分钟水平数据,并且我有兴趣计算5分钟平均值。但是,我不确定在指定为列表时rollapply中的width
参数如何工作。
library(tidyverse)
library(zoo)
length = 16
set.seed(10)
dxf <- data.frame(
date = seq(Sys.time(), by = "59 sec", length.out = length),
value = runif(length)
)
# Create a "discontinuity"
dxf$date[8:length] <- dxf$date[8:length] + 3600*24
# Add some noise
dxf$date <- dxf$date + runif(length, 0, 1)
diff(dxf$date)
dxf %>%
arrange(date) %>%
mutate(
diff = c(as.numeric(diff(date)), NA),
mean = rollapply(value, width = 5, mean, partial = TRUE, align = "left")
)
# This is what I need. Therefore, I need a variable width but adjusting to the discontinuity in the rows.
mean1 <- mean(dxf$value[1:5])
mean2 <- mean(dxf$value[2:6])
mean3 <- mean(dxf$value[3:7])
mean4 <- NA # Only have 4 values mean(dxf$value[4:7])
mean5 <- NA # Only have 3 values mean(dxf$value[5:7])
mean6 <- NA # Only have 2 values mean(dxf$value[6:7])
mean7 <- NA # Only have 1 values mean(dxf$value[7:7])
mean8 <- mean(dxf$value[7:11])
etc.
答案 0 :(得分:1)
我认为这是一个棘手的问题。这是一种方法
1从第一个日期时间到最后一个日期时间生成一个1分钟的序列
2进行插值,因此每1分钟有一个值。这包括在不连续处进行插值
3根据1分钟的内插值计算连续5分钟的平均值
4删除原始日期时间值之间的距离过大的值
还请注意时区,最好将其设置为lubridate功能默认情况下故意选择的值或UTC。
library(tidyverse)
library(RcppRoll)
library(lubridate)
dxf <- tibble(
date = seq(from = ymd_hms('2019-08-14 09:06:05'), by = "59 sec", length.out = 30),
value = runif(30)
)
dxf$date[15:30] <- dxf$date[15:30] + 3600*24 # discontinuing
dxf$date <- dxf$date + round(runif(30)) # noise
dxf <- dxf %>%
mutate(date = ymd_hms(date),
date_num = as.numeric(date),
diff = date_num - lag(date_num))
discontinuity <- which(dxf$diff > 70)
n = nrow(dxf)
date_seq <- seq(from = dxf$date_num[1], to = dxf$date_num[n], by = 60) # create a 1 min sequence
value_interp = approx(x = dxf$date_num, y = dxf$value, xout = date_seq) # interpolate values for the 5 min sequence
df <- tibble(
date = as_datetime(date_seq),
mean_value = RcppRoll::roll_mean(value_interp$y, n = 5, fill = NA, align = 'left'))
df %>%
filter(date < dxf$date[discontinuity - 1] | date > dxf$date[discontinuity])
答案 1 :(得分:0)
我们可以提取日期并将其分组,然后使用rollmean
library(dplyr)
dxf %>%
mutate(d1 = as.Date(date)) %>%
group_by(d1) %>%
mutate(mean = zoo::rollmean(value, 5, align = "left", fill = NA)) %>%
ungroup %>%
select(-d1)
# date value mean
# <dttm> <dbl> <dbl>
# 1 2019-08-14 12:49:09 0.507 0.404
# 2 2019-08-14 12:50:08 0.307 0.347
# 3 2019-08-14 12:51:07 0.427 0.341
# 4 2019-08-14 12:52:07 0.693 NA
# 5 2019-08-14 12:53:06 0.0851 NA
# 6 2019-08-14 12:54:05 0.225 NA
# 7 2019-08-14 12:55:04 0.275 NA
# 8 2019-08-15 12:56:02 0.272 0.507
# 9 2019-08-15 12:57:01 0.616 0.476
#10 2019-08-15 12:58:01 0.430 0.472
#11 2019-08-15 12:59:00 0.652 0.457
#12 2019-08-15 12:59:58 0.568 0.413
#13 2019-08-15 13:00:58 0.114 NA
#14 2019-08-15 13:01:56 0.596 NA
#15 2019-08-15 13:02:56 0.358 NA
#16 2019-08-15 13:03:54 0.429 NA
数据
set.seed(10)
dxf <- data.frame(
date = seq(Sys.time(), by = "59 sec", length.out = length),
value = runif(length)
)
dxf$date[8:length] <- dxf$date[8:length] + 3600*24
dxf$date <- dxf$date + runif(length, 0, 1)
答案 2 :(得分:0)
此处w[i]
是date
的元素数,该元素数小于或等于date[i] + 300
减去i - 1
,请注意300表示300秒。
date <- dxf$date
w <- findInterval(date + 300, date) - seq_along(date) + 1
rollapply(dxf$value, w, mean, align = "left") * ifelse(w < 5, NA, 1)
# same
sapply(seq_along(w), function(i) mean(dxf$value[seq(i, length = w[i])])) *
ifelse(w < 5, NA, 1)