使用月平均值填充缺失值

时间:2018-01-17 15:51:29

标签: r date

我有大约10年的月度时间序列数据。问题是序列中有少量缺失值(NA)。我的目标是用序列中所有相同月份的平均值来填充缺失值。例如,1994年7月(以及其他一些月份)的值缺失,我想用序列中其他所有7月的平均值来替换这个缺失值。该序列如下所示。

# Monthly date index from December 1993 through May 2005.
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")

# Excerpt of the observed series (55 values, two of them NA); it is therefore
# shorter than `Date`. Every element must be comma-separated inside c().
value <- c(2.20000, 0.70000, 2.80000, 0.80000, 1.20000, 1.50000, 0.90000,      NA, 0.80000, 1.70000, 2.40000,
           1.60000, 4.00000, 4.40000, 2.50000, 1.10000, 2.40000, 1.10000, 3.70000, 6.40000, 7.00000, 2.90000,
           3.00000, 3.60000, 4.90000,      NA, 4.90000, 0.70000, 0.30000, 0.70000, 0.37000, 0.20000, 0.05000,
           0.35000, 0.05000, 2.50000, 0.08000, 0.12000, 2.53000, 2.49000, 2.04000, 2.92000, 0.27000, 0.33000,
           0.10000, 0.36000, 2.32000, 0.19000, 0.23000, 0.18000, 0.28000, 0.19500, 0.07300, 0.65000, 0.30000)

向量 value 只是该序列的节选,而不是完整的序列。

有人知道如何实现这一目标吗?

4 个答案:

答案 0 :(得分:0)

与此类似 - R: Replacing NA values by mean of hour with dplyr

library(dplyr)
library(lubridate)

# Build a reproducible monthly example series.
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")
set.seed(42)
# generate random data
value <- runif(length(Date))
# Blank out exactly three distinct observations. sample() returns unique
# integer positions, whereas runif() yields fractional indices that are
# silently truncated and may collide.
value[sample(length(value), 3)] <- NA

df <- data.frame(Date = Date, value = value)

# For every calendar month, compute the mean of the observed values and use it
# to fill the gaps of that same month. The result is ungrouped so later verbs
# do not silently operate per month.
out <- df %>%
  mutate(Month = month(Date)) %>%
  group_by(Month) %>%
  mutate(
    monthly_average = mean(value, na.rm = TRUE),
    new_value = coalesce(value, monthly_average)
  ) %>%
  ungroup()

答案 1 :(得分:0)

您的样本数据长度不一致,因此我自己生成了一些数据。做法是按月份分组,若某个值缺失,则用该月份的平均值替换。如果您的数据存在趋势,可能需要其他方法,但这应该能满足您的要求:

library(dplyr)

# Reproducible monthly example series of standard-normal values.
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")
set.seed(123)
df <- data.frame(date = Date,
                 value = rnorm(n = length(Date)))
# Add 10% missing data; integer division makes the rounding down to a whole
# number of rows explicit instead of relying on sample() truncating `size`.
df$value[sample(x = seq_len(nrow(df)), size = nrow(df) %/% 10)] <- NA

df_complete <- df %>%
  # group by month:
  group_by(month = format(date, "%m")) %>%
  # replace with the month's mean, if missing:
  mutate(value_complete = coalesce(value, mean(value, na.rm = TRUE))) %>%
  # drop the grouping so the result behaves like a plain table again:
  ungroup()

df_complete

答案 2 :(得分:0)

我稍微调整了一下您的初始数据,以便能够使用两列创建数据框:

# Monthly date index shortened so that it has one entry per element of `value`.
Date <- seq(as.Date("1993-12-01"), as.Date("1998-06-01"), by = "1 month")

value <- c(2.20000, 0.70000, 2.80000, 0.80000, 1.20000, 1.50000, 0.90000,      NA, 0.80000, 1.70000, 2.40000,
           1.60000, 4.00000, 4.40000, 2.50000, 1.10000, 2.40000, 1.10000, 3.70000, 6.40000, 7.00000, 2.90000,
           3.00000, 3.60000, 4.90000,      NA, 4.90000, 0.70000, 0.30000, 0.70000 ,0.37000, 0.20000, 0.05000,
           0.35000, 0.05000, 2.50000, 0.08000, 0.12000, 2.53000, 2.49000, 2.04000, 2.92000, 0.27000, 0.33000,
           0.10000, 0.36000, 2.32000, 0.19000, 0.23000, 0.18000, 0.28000, 0.19500, 0.07300, 0.65000, 0.30000)

mydf <- data.frame(Date, value)

# Get the month and add it as a column
mydf$month <- months(mydf$Date)

# Calculate the average by month, ignoring NAs (result columns: `month`, `x`)
avg_by_month <- with(mydf, aggregate(value, by = list(month = month), FUN = mean, na.rm = TRUE))

# Merge averaged values to the data frame
mydf2 <- merge(x = mydf, y = avg_by_month, by = "month")

# Use the monthly average only where the observation is missing
is_missing <- is.na(mydf2$value)
mydf2$value[is_missing] <- mydf2$x[is_missing]

# Discard average if not needed anymore
mydf2$x <- NULL

# merge() sorts the rows by the key (the month name), which scrambles the
# time series; put it back into chronological order.
mydf2 <- mydf2[order(mydf2$Date), ]
rownames(mydf2) <- NULL

答案 3 :(得分:0)

为了完整起见,这里再提供一个data.table解决方案,它仅更新NA值,并且无需额外添加month列:

library(data.table)

# Lookup table of per-month means with NAs excluded (columns: month, V1).
avg_by_month <- setDT(DT)[, mean(value, na.rm = TRUE), by = month(date)]

# Update by reference, touching only the NA rows: they are grouped by month,
# and each group's key (.BY) selects the matching average from the lookup.
DT[
  is.na(value),
  value := avg_by_month[month == .BY[[1L]], V1],
  by = month(date)
]

这里的“技巧”是:将含有NA值的行按month(date)分组,并使用分组变量.BY查找对应的月平均值。

可重复的例子

OP提供的value中包含两个NA值。这里沿用了这些数据,并额外将另一个值(1994年1月)设为NA,以得到更好的测试用例:

library(data.table)

# OP's sample series with one extra gap: besides the two original NAs,
# the second observation is set to NA as well.
value <- c(
  2.2, NA, 2.8, 0.8, 1.2, 1.5, 0.9, NA, 0.8, 1.7, 2.4,
  1.6, 4, 4.4, 2.5, 1.1, 2.4, 1.1, 3.7, 6.4, 7, 2.9,
  3, 3.6, 4.9, NA, 4.9, 0.7, 0.3, 0.7, 0.37, 0.2, 0.05,
  0.35, 0.05, 2.5, 0.08, 0.12, 2.53, 2.49, 2.04, 2.92, 0.27, 0.33,
  0.1, 0.36, 2.32, 0.19, 0.23, 0.18, 0.28, 0.195, 0.073, 0.65, 0.3
)

# One row per month, starting in December 1993, as long as `value`.
DT <- data.table(
  value,
  date = seq(as.Date("1993-12-01"), by = "1 month", length.out = length(value))
)

# Row numbers of the missing observations, kept for inspection later on.
na_idx <- which(is.na(DT$value))
DT[na_idx]
   value       date
1:    NA 1994-01-01
2:    NA 1994-07-01
3:    NA 1996-01-01
# Lookup table of per-month means with NAs excluded (columns: month, V1).
avg_by_month <- setDT(DT)[, mean(value, na.rm = TRUE), by = month(date)]
# Fill only the missing rows, by reference, using each group's month key.
DT[
  is.na(value),
  value := avg_by_month[month == .BY[[1L]], V1],
  by = month(date)
]

# Show the rows that were NA before the update.
DT[na_idx]
      value       date
1: 1.566667 1994-01-01
2: 2.310000 1994-07-01
3: 1.566667 1996-01-01