如何以10分钟为步长汇总数据

时间:2020-01-15 09:41:29

标签: r time-series

我有一个具有不同时间步长的数据框,我想将其转换为偶数时间步长。每10分钟应写入一个值,如果没有新值,则应采用前一个值(请参阅2019-01-01 01:00:00和2019-01-01 02:30:00)。

                date        ZUL_T
1   2019-01-01 00:04:00     23.3
2   2019-01-01 00:15:00     23.3
3   2019-01-01 00:26:00     19.9
4   2019-01-01 00:37:00     20.7
5   2019-01-01 00:48:00     21.9
6   2019-01-01 00:59:00     21.9
7   2019-01-01 01:10:00     18.8
8   2019-01-01 01:22:00     18.8
9   2019-01-01 01:33:00     20.7
10  2019-01-01 01:44:00     21.6
11  2019-01-01 01:55:00     19.2
12  2019-01-01 02:06:00     19.2
13  2019-01-01 02:17:00     19.6
14  2019-01-01 02:29:00     19.6
15  2019-01-01 02:40:00     20.5

这是我当前的代码,但是如果DS中没有值,则会缺少一些时间步骤。

library(lubridate)

# Lower bound: earliest observation, floored to a whole 10-minute mark.
lowtime <- min(DS$date)
minute(lowtime) <- 10 * floor(minute(lowtime) / 10)
second(lowtime) <- 0

# Upper bound: latest observation, rounded up to a whole 10-minute mark.
hightime <- max(DS$date)
minute(hightime) <- 10 * ceiling(minute(hightime) / 10)
second(hightime) <- 0

# Bin edges every 10 minutes (600 seconds) between the two bounds.
breakpoints <- seq(lowtime, hightime, by = "10 min")

# Mean ZUL_T per bin; only bins that contain at least one observation
# produce a row in the result.
ZUL_T <- aggregate(
  ZUL_T ~ cut(date, breaks = breakpoints),
  data = DS,
  FUN = mean
)


> data
                        date                       ZUL_T
1                       2019-01-01 00:00:00        23.3
2                       2019-01-01 00:10:00        23.3
3                       2019-01-01 00:20:00        19.9
4                       2019-01-01 00:30:00        20.7
5                       2019-01-01 00:40:00        21.9
6                       2019-01-01 00:50:00        21.9
7                       2019-01-01 01:10:00        18.8
8                       2019-01-01 01:20:00        18.8
9                       2019-01-01 01:30:00        20.7
10                      2019-01-01 01:40:00        21.6
11                      2019-01-01 01:50:00        19.2
12                      2019-01-01 02:00:00        19.2
13                      2019-01-01 02:10:00        19.6
14                      2019-01-01 02:20:00        19.6
15                      2019-01-01 02:40:00        20.5

2 个答案:

答案 0 :(得分:0)

您可以把 `breakpoints` 转成数据框,再与汇总结果进行 `merge`(合并)。

# Move the upper bound one extra 10-minute step beyond the latest
# observation (presumably so the last reading is not left outside the
# final bin -- confirm), then rebuild the 10-minute bin edges.
minute(hightime) <- 10 * ceiling((minute(max(DS$date)) + 10) / 10)
breakpoints <- seq(lowtime, hightime, by = "10 min")

使用经典的列表写法调用 `aggregate`,以便结果得到合适的列名。

# Average ZUL_T within each 10-minute bin. Passing named lists makes the
# result columns come out as `date` and `ZUL_T`.
ZUL_T <- aggregate(
  x = list(ZUL_T = DS$ZUL_T),
  by = list(date = cut(DS$date, breaks = breakpoints)),
  FUN = mean
)

现在merge

# Full grid of bin starts (every breakpoint except the last), stored as
# text so it can be matched against the bin labels of the aggregated data.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
all_bins <- data.frame(
  date = as.character(breakpoints[-length(breakpoints)]),
  stringsAsFactors = FALSE
)

# Outer join on `date`: bins that had no observations show up with NA.
ZUL_T <- merge(transform(ZUL_T, date = as.character(date)), all_bins,
               by = "date", all = TRUE)

然后用前一行(索引减 1 处)的值替换 NA 值。

# Carry the last observed value forward into empty bins.
# For every row, `last_seen` is the index of the most recent non-NA row at
# or before it (0 when no value has been seen yet). Unlike indexing with
# `which(is.na(x)) - 1`, this also fills runs of several consecutive NAs
# and leaves leading NAs untouched instead of failing on index 0.
has_value <- !is.na(ZUL_T$ZUL_T)
last_seen <- cummax(seq_along(has_value) * has_value)
# Zero indices are dropped on the right-hand side, so both sides have
# exactly sum(last_seen > 0) elements.
ZUL_T$ZUL_T[last_seen > 0] <- ZUL_T$ZUL_T[last_seen]
ZUL_T
#                   date ZUL_T
# 1  2019-01-01 00:00:00  23.3
# 2  2019-01-01 00:10:00  23.3
# 3  2019-01-01 00:20:00  19.9
# 4  2019-01-01 00:30:00  20.7
# 5  2019-01-01 00:40:00  21.9
# 6  2019-01-01 00:50:00  21.9
# 7  2019-01-01 01:00:00  21.9
# 8  2019-01-01 01:10:00  18.8
# 9  2019-01-01 01:20:00  18.8
# 10 2019-01-01 01:30:00  20.7
# 11 2019-01-01 01:40:00  21.6
# 12 2019-01-01 01:50:00  19.2
# 13 2019-01-01 02:00:00  19.2
# 14 2019-01-01 02:10:00  19.6
# 15 2019-01-01 02:20:00  19.6
# 16 2019-01-01 02:30:00  19.6
# 17 2019-01-01 02:40:00  20.5

答案 1 :(得分:0)

我们可以使用 `lubridate` 包中的 `floor_date`,把时间向下取整到每 10 分钟的起点,然后按该时间分组,并对 ZUL_T 值求和(`sum`)。

library(dplyr)
library(lubridate)
library(tidyr)

# Label each reading with the start of its 10-minute window, then total
# the readings per window.
# NOTE(review): the question's own code aggregates with mean -- confirm
# that sum is what is wanted here.
df %>%
  mutate(date = floor_date(ymd_hms(date), unit = "10 mins")) %>%
  group_by(date) %>%
  summarise(ZUL_T = sum(ZUL_T))


#   date                ZUL_T
#   <dttm>              <dbl>
# 1 2019-01-01 00:00:00  23.3
# 2 2019-01-01 00:10:00  23.3
# 3 2019-01-01 00:20:00  19.9
# 4 2019-01-01 00:30:00  20.7
# 5 2019-01-01 00:40:00  21.9
# 6 2019-01-01 00:50:00  21.9
# 7 2019-01-01 01:10:00  18.8
# 8 2019-01-01 01:20:00  18.8
# 9 2019-01-01 01:30:00  20.7
#10 2019-01-01 01:40:00  21.6
#11 2019-01-01 01:50:00  19.2
#12 2019-01-01 02:00:00  19.2
#13 2019-01-01 02:10:00  19.6
#14 2019-01-01 02:20:00  19.6
#15 2019-01-01 02:40:00  20.5

然后使用 `complete` 和 `fill` 补全缺失的时间点,并用前一个值填充 NA 值。

# Sum ZUL_T per 10-minute window, add a row for every window between the
# first and the last one, and fill the gaps with the previous value.
df %>%
  group_by(date = floor_date(ymd_hms(date), "10 mins")) %>%
  # The pipe after summarise() is required: without it complete() is run
  # as a separate statement that has no data argument.
  summarise(ZUL_T = sum(ZUL_T)) %>%
  # Windows absent from the data are inserted with ZUL_T = NA ...
  complete(date = seq(min(date), max(date), by = "10 mins")) %>%
  # ... and then take the last non-missing value above them.
  fill(ZUL_T)


#   date                ZUL_T
#   <dttm>              <dbl>
# 1 2019-01-01 00:00:00  23.3
# 2 2019-01-01 00:10:00  23.3
# 3 2019-01-01 00:20:00  19.9
# 4 2019-01-01 00:30:00  20.7
# 5 2019-01-01 00:40:00  21.9
# 6 2019-01-01 00:50:00  21.9
# 7 2019-01-01 01:00:00  21.9
# 8 2019-01-01 01:10:00  18.8
# 9 2019-01-01 01:20:00  18.8
#10 2019-01-01 01:30:00  20.7
#11 2019-01-01 01:40:00  21.6
#12 2019-01-01 01:50:00  19.2
#13 2019-01-01 02:00:00  19.2
#14 2019-01-01 02:10:00  19.6
#15 2019-01-01 02:20:00  19.6
#16 2019-01-01 02:30:00  19.6
#17 2019-01-01 02:40:00  20.5

数据

# Sample data from the question: 15 readings. `date` is a factor whose
# levels are the timestamp strings; `ZUL_T` holds the matching values.
df <- data.frame(
  date = factor(c(
    "2019-01-01 00:04:00", "2019-01-01 00:15:00", "2019-01-01 00:26:00",
    "2019-01-01 00:37:00", "2019-01-01 00:48:00", "2019-01-01 00:59:00",
    "2019-01-01 01:10:00", "2019-01-01 01:22:00", "2019-01-01 01:33:00",
    "2019-01-01 01:44:00", "2019-01-01 01:55:00", "2019-01-01 02:06:00",
    "2019-01-01 02:17:00", "2019-01-01 02:29:00", "2019-01-01 02:40:00"
  )),
  ZUL_T = c(
    23.3, 23.3, 19.9, 20.7, 21.9, 21.9, 18.8, 18.8,
    20.7, 21.6, 19.2, 19.2, 19.6, 19.6, 20.5
  )
)