Ozone time series

Time: 2019-01-04 22:22:01

Tags: r date datetime

I am working with a time series of continuous measurements of the ozone concentration in ambient air over a four-month period. A measurement is taken every 5 minutes and 14 seconds, giving roughly 40,000 data points.

I have started processing the data in R, but I am running into problems because of my lack of skills.

My data frame consists of Date (as character) and the ozone concentration (as numeric):

     Date                     O3_ppb
   2018-05-26 17:55:58 UTC    33.95161
   2018-05-26 18:01:12 UTC    35.12605 
   2018-05-26 18:06:28 UTC    36.03172 
   2018-05-26 18:11:42 UTC    36.81590
   2018-05-26 18:16:57 UTC    37.11235 
   2018-05-26 18:22:12 UTC    37.26945

I would like to illustrate the daily variation of the ozone concentration over the 24 hours of the day, based on one month of data. That is, I want a monthly average for every 5-minute interval of the 24-hour day.

My idea is that I somehow need to group the data into 5-minute intervals over the 24 hours, e.g. 00:00:00, 00:05:00, 00:10:00, and so on. But because of the drift in the measurement times, a measurement made at 00:05:00 on one day is made at 00:06:20 the following day, and so on. And because the sensor restarts from time to time, the number of observations per day also fluctuates.

My question: Is there a function or loop that can group my data into 5-minute intervals while also accounting for the drift, so that, for example, a measurement made between 00:02:30 and 00:07:30 is grouped under 00:05:00, one made between 00:07:30 and 00:12:30 under 00:10:00, and so on?

Sorry if this is completely incomprehensible, but I am new to R and to programming in general. I really hope someone can help me so I can get the project started.
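Roughly, what I am after is something like this (just a sketch of the idea, assuming my data frame is called df and Date has already been converted to a date-time): round every timestamp to the nearest 5 minutes, keep only the time of day, and average per time of day over the whole month.

# rough idea only: round to the nearest 5 minutes (300 s), keep the time of day,
# then average O3_ppb per time of day over the whole month
rounded <- as.POSIXct(round(as.numeric(df$Date) / 300) * 300,
                      origin = "1970-01-01", tz = "UTC")
aggregate(df$O3_ppb, by = list(time_of_day = format(rounded, "%H:%M")), FUN = mean)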

2 answers:

Answer 0 (score: 0)

Here is an approach using an overlap join with data.table's foverlaps().

library( data.table )

dt <- fread(' Date                     O3_ppb
            "2018-05-26 17:55:58"    33.95161
            "2018-05-26 18:01:12"    35.12605 
            "2018-05-26 18:06:28"    36.03172 
            "2018-05-26 18:11:42"    36.81590
            "2018-05-26 18:16:57"    37.11235 
            "2018-05-26 18:22:12"    37.26945', header = TRUE)

#set to posix
dt[, Date := as.POSIXct( Date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC") ]
#create dummy variables to join on later
dt[, `:=`( Start = Date, Stop = Date ) ]

#create data.table with periods you wish to summarise on later
#notice the +/- 150 (=00:02:30) to set a 5 minute 'bandwidth' around the period.
dt.period <- data.table( period = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
                                  as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
                                  by = "5 mins"),
                         Start = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 150,
                                      as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 150 ,
                                      by = "5 mins"),
                         Stop = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) + 150,
                                     as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) + 150,
                                     by = "5 mins") )

#perform overlap join
#first set keys
setkey(dt.period, Start, Stop)
#then perform join
result <- foverlaps( dt, dt.period, type = "within", nomatch = NA )
#summarise
result[, .( O3_ppb_avg = mean( O3_ppb, na.rm = TRUE ) ), by = .(period) ]

Output:

#                 period O3_ppb_avg
# 1: 2018-05-26 17:55:00   33.95161
# 2: 2018-05-26 18:00:00   35.12605
# 3: 2018-05-26 18:05:00   36.03172
# 4: 2018-05-26 18:10:00   36.81590
# 5: 2018-05-26 18:15:00   37.11235
# 6: 2018-05-26 18:20:00   37.26945
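The grid above only covers a single day (2018-05-26). To get the monthly daily profile asked for in the question, one could build dt.period the same way but spanning the whole month, and then average per 5-minute slot of the day rather than per full timestamp. A rough sketch (the column name time_of_day is only illustrative):

#collapse period to its time of day, so that all days of the month
#contribute to the same 5-minute slot, then average per slot
result[, .( O3_ppb_avg = mean( O3_ppb, na.rm = TRUE ) ),
       by = .( time_of_day = format( period, "%H:%M" ) ) ]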

Answer 1 (score: 0)

Here is an approach using lubridate that rounds each timestamp to the nearest 5 minutes, regardless of the date.

# Load data
library(tidyverse); library(lubridate)
df <- read.table(header = T, stringsAsFactors = F,
text = "Date                     O3_ppb
   '2018-05-26 17:55:58 UTC'    33.95161
   '2018-05-26 18:01:12 UTC'    35.12605 
   '2018-05-26 18:06:28 UTC'    36.03172 
   '2018-05-26 18:11:42 UTC'    36.81590
   '2018-05-26 18:16:57 UTC'    37.11235 
   '2018-05-26 18:22:12 UTC'    37.26945") %>%
  mutate(Date = ymd_hms(Date))


df2 <- df %>%
     # By adding 2.5 min = 150 sec and rounding down, we get closest 5 min
     mutate(Date_rnd = floor_date(Date + 150, "5 minutes"),

     # One option is to group by decimal time of day 
     group    = hour(Date_rnd) + minute(Date_rnd)/60,

     # ...or could convert that to a time on a single day, in this case today
     group_as_datetime = floor_date(Sys.time(), "1 day") + group*60*60)

Output:

> df2
#                 Date   O3_ppb            Date_rnd    group   group_as_datetime
#1 2018-05-26 17:55:58 33.95161 2018-05-26 17:55:00 17.91667 2019-01-05 17:55:00
#2 2018-05-26 18:01:12 35.12605 2018-05-26 18:00:00 18.00000 2019-01-05 18:00:00
#3 2018-05-26 18:06:28 36.03172 2018-05-26 18:05:00 18.08333 2019-01-05 18:05:00
#4 2018-05-26 18:11:42 36.81590 2018-05-26 18:10:00 18.16667 2019-01-05 18:10:00
#5 2018-05-26 18:16:57 37.11235 2018-05-26 18:15:00 18.25000 2019-01-05 18:15:00
#6 2018-05-26 18:22:12 37.26945 2018-05-26 18:20:00 18.33333 2019-01-05 18:20:00
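
df2 only adds the grouping columns; the monthly daily profile then follows from a grouped summary. A minimal sketch, assuming df2 holds the full month of data (the name daily_profile is only illustrative):

# average O3_ppb per 5-minute slot of the day, across all days in the month
daily_profile <- df2 %>%
  group_by(group_as_datetime) %>%
  summarise(O3_ppb_avg = mean(O3_ppb, na.rm = TRUE))

The result can then be plotted directly, e.g. ggplot(daily_profile, aes(group_as_datetime, O3_ppb_avg)) + geom_line().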