Question

我是 R 的新手，这是我第一次遇到困难。我有一个约 10,000 条记录的数据集，覆盖 365 天，按小时记录事件的发生次数。这些数据只在每个月的前 14 天有记录。我希望用相应月份中同一小时的已有记录的平均值，来补全每个月其余约 16 天的数据。

结构如下：

                    day           hours      occurrence
                    2000-01-01     1          5
                    2000-01-01     2          6
                    2000-01-01     3          7
                    ...            ...        ...
                    2000-01-01     23         3
                    2000-01-01     24         2
                    ...            ...        ...
                    2000-01-02     1          4
                    2000-01-02     2          2
                    2000-01-02     3          5
                    ...            ...        ...
                    2000-01-02     23         2
                    2000-01-02     24         1
                    ...
                    ...
                    2000-01-15     1          average of the previous 1 hours((5+4+n)/2*k))
                    2000-01-15     2          average of the previous 2 hours ((6+2+n)/2*k))
                    2000-01-15     3          average of the previous 3 hours((7+5+n)/2*k))
                    ...            ...         ...
                    2000-01-15     23         average of the previous 23 hours
                    2000-01-15     24         average of the previous 24 hours
                    ...            ...         ...
                    ...            ...         ...
                    2000-01-30
                    2000-01-30
                    2000-01-30
                    2000-01-30
                    ...            ...         ...
                    ...            ...         ...
                    2000-02-01
                    2000-02-01
                    2000-02-01
                    2000-02-01
                    ...            ...         ...
                    ...
                    ...            ...         ...
                    2000-12-24

我试过了

               # NOTE(review): the second positional argument of aggregate()'s formula
               # method is `data`, so `mean` is not received as FUN here. The formula
               # also has no month term, so every month would be pooled per hour.
               aggregate( occurences ~ hours, mean)

但结果毫无意义，我试过

               # NOTE(review): R is case-sensitive -- the base function is `mean`;
               # `Mean` is not defined in base R. Indexing by hours alone also
               # ignores the month.
               tapply( X = occurences, INDEX = list(hours), FUN = Mean )

不幸的是，两者都没有像我想象的那样起作用。我认为有必要将相应的月份包含在函数中。然而，我的手段似乎有限。

Answer 1

你可以试试这个。请注意，为了使示例更小，我每月只选择第 1-4 天、第 0-1 小时的数据。每个月的第 1 天和第 2 天有发生数据，第 3 天和第 4 天缺少发生数据。

library(dplyr)

# Build a small reproducible example ----
# Hourly timestamps for January-February 2000. Only hours 0-1 of days 1-4 are
# kept so the printed output stays short. Days 1-2 of each month carry observed
# counts; days 3-4 are set to NA and are the rows to be filled in.
set.seed(123) # fix the random draws so repeated runs give the same sample

d1 <- data.frame(time = seq(from = as.POSIXct("2000-01-01"),
                            to = as.POSIXct("2000-02-28"),
                            by = "hour"))
d1 <- d1 %>%
  mutate(hour = as.integer(format(time, "%H")),
         day = as.integer(format(time, "%d")), # only needed to build the sample
         month = as.integer(format(time, "%m")),
         # year is part of the grouping key so that the same calendar month of
         # different years is never averaged together
         year = as.integer(format(time, "%Y")),
         occurence = sample(1:10, length(time), replace = TRUE),
         occurence = ifelse(day %in% 1:2, occurence, NA)) %>% # data only for day 1-2
  filter(hour %in% 0:1 & day %in% 1:4) %>% # smaller example: hour 0-1, day 1-4
  select(-day)

# Mean occurrence per year, month and hour ----
# NA rows are ignored by na.rm = TRUE; ungroup() returns a plain (ungrouped)
# table so later verbs are not silently applied per group.
d2 <- d1 %>%
  group_by(year, month, hour) %>%
  summarise(mean_occ = round(mean(occurence, na.rm = TRUE), 1)) %>%
  ungroup()
d2
# Output from the original run; the random values may differ across R versions.
#   year month hour mean_occ
# 1 2000     1    0      5.0
# 2 2000     1    1      8.0
# 3 2000     2    0      5.5
# 4 2000     2    1      6.5


# Replace missing occurrence with the matching mean_occ ----
# occurence2 keeps observed values and uses the year/month/hour mean elsewhere;
# the helper columns are dropped again afterwards.
d3 <- d1 %>%
  left_join(d2, by = c("year", "month", "hour")) %>%
  mutate(occurence2 = ifelse(is.na(occurence), mean_occ, occurence)) %>%
  select(-year, -month, -mean_occ)

d3
# Output from the original run; values (and column order) may differ across
# R / dplyr versions.
#    hour                time occurence occurence2
# 1     0 2000-01-01 00:00:00         3        3.0
# 2     1 2000-01-01 01:00:00         8        8.0
# 3     0 2000-01-02 00:00:00         7        7.0
# 4     1 2000-01-02 01:00:00         8        8.0
# 5     0 2000-01-03 00:00:00        NA        5.0
# 6     1 2000-01-03 01:00:00        NA        8.0
# 7     0 2000-01-04 00:00:00        NA        5.0
# 8     1 2000-01-04 01:00:00        NA        8.0
# 9     0 2000-02-01 00:00:00         4        4.0
# 10    1 2000-02-01 01:00:00         6        6.0
# 11    0 2000-02-02 00:00:00         7        7.0
# 12    1 2000-02-02 01:00:00         7        7.0
# 13    0 2000-02-03 00:00:00        NA        5.5
# 14    1 2000-02-03 01:00:00        NA        6.5
# 15    0 2000-02-04 00:00:00        NA        5.5
# 16    1 2000-02-04 01:00:00        NA        6.5

Answer 2

与@Henrik略有不同的方法：

library(lubridate)
library(data.table)

# Fill the rows after day 14 of every month with the mean of the first 14 days
# of the same year, month AND hour. Df must provide the columns `days`,
# `hours` and `occurrence`.
setDT(Df) # convert to a data.table in place
Df[, month := month(days)]
Df[, year := year(days)]

# Rows to be imputed (day of month > 14) vs. rows with observed values
naDf <- Df[mday(days) > 14, ]
subDf <- Df[mday(days) <= 14, ]

# Average per hour within each month/year; without `hours` in the key every
# imputed hour of a month would receive the same overall monthly mean
avgDf <- subDf[
  ,
  list(occurrence = mean(occurrence)),
  by = "month,year,hours"]

# Attach the hourly means to the rows being imputed
naDf <- base::merge(
  x = naDf[, list(days, hours, month, year)],
  y = avgDf,
  by = c("month", "year", "hours"))
# Recombine observed and imputed rows in chronological order
newDf <- rbind(
  subDf, naDf,
  use.names = TRUE)[order(days, hours), ]

数据：我只使用了一年的数据，但这应该适用于较长时间的窗口，因为聚合和连接是基于年和月完成的。

# Sample data: 365 consecutive days starting 2000-01-01, 24 hourly rows per
# day, with a random integer count between 1 and 15 for every row
d0 <- as.Date("2000-01-01")
set.seed(123) # reproducible draws
Df <- data.frame(
  days = rep(d0 + 0:364, each = 24),
  hours = rep(1:24, 365),
  occurrence = sample(1:15, 24 * 365, replace = TRUE))

作为一个简单的合理性检查：

# Blank out the days after the 14th in the raw data so its plot shows the gaps
Df[mday(days) > 14, occurrence := NA]
# Build a timestamp for the x axis from the date and the hour column.
# NOTE(review): the extra 4 hours is kept from the original answer --
# presumably a local-time offset; confirm before reusing.
Df[, datetime := as.POSIXct(days, tz = "GMT") + 3600 * (4 + hours)]
newDf[, datetime := as.POSIXct(days, tz = "GMT") + 3600 * (4 + hours)]

library(ggplot2)
# Series with the imputed values (red)
ggplot(
  data = newDf[200:800, ],
  aes(x = datetime, y = occurrence)) +
  geom_line(color = "red")
# Raw series with the blanked-out days
ggplot(
  data = Df[200:800, ],
  aes(x = datetime, y = occurrence)) +
  geom_line()

我只使用了一部分行（200:800），这样图形不会显得太拥挤。

如何在一段时间内按小时平均？

2 个答案: