我有大约10年的月度时间序列数据。问题是序列中有少量缺失值(NA)。我的目标是用序列中所有相同月份的平均值来填充缺失值。例如,我缺少1994年7月(以及其他一些月份)的值,我想用序列中其他所有7月份的平均值来替换这个缺失值。该序列如下所示。
# Monthly date index for the series (December 1993 through May 2005).
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")
# Excerpt of the observed values; NA marks a missing observation.
# This excerpt is shorter than `Date`.
value <- c(
  2.20000, 0.70000, 2.80000, 0.80000, 1.20000, 1.50000, 0.90000, NA,
  0.80000, 1.70000, 2.40000,
  1.60000, 4.00000, 4.40000, 2.50000, 1.10000, 2.40000, 1.10000, 3.70000,
  6.40000, 7.00000, 2.90000,
  3.00000, 3.60000, 4.90000, NA, 4.90000, 0.70000, 0.30000, 0.70000,
  0.37000, 0.20000, 0.05000,
  0.35000, 0.05000, 2.50000, 0.08000, 0.12000, 2.53000, 2.49000, 2.04000,
  2.92000, 0.27000, 0.33000,
  0.10000, 0.36000, 2.32000, 0.19000, 0.23000, 0.18000, 0.28000, 0.19500,
  0.07300, 0.65000, 0.30000
)
`value` 向量只是序列的节选,而不是整个序列。
有人知道如何实现这一目标吗?
答案 0 :(得分:0)
与此类似 - R: Replacing NA values by mean of hour with dplyr
library(dplyr)
library(lubridate)

# Monthly date index, December 1993 through May 2005.
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")

# Generate random data.
value <- runif(length(Date))

# Blank out three distinct observations. sample.int() returns unique integer
# positions; indices drawn with runif() are fractional, get truncated when
# used as subscripts, may collide, and can never select the last element.
value[sample.int(length(value), 3L)] <- NA

df <- data.frame(Date = Date, value = value)

# For every calendar month, compute the mean over all years (ignoring NAs)
# and use it to fill the missing values of that month.
out <- df %>%
  mutate(Month = month(Date)) %>%
  group_by(Month) %>%
  mutate(
    monthly_average = mean(value, na.rm = TRUE),
    # Keep observed values; fall back to the month's mean where value is NA.
    new_value = coalesce(value, monthly_average)
  ) %>%
  # Drop the grouping so later operations on `out` are not done per month.
  ungroup()
答案 1 :(得分:0)
您的示例数据中 `Date` 与 `value` 的长度不一致,因此我自己生成了一些数据。方法是按月份分组,对缺失值用该月份的平均值填充。如果您的数据中存在趋势,您可能需要不同的方法,但这应该符合您的要求:
library(dplyr)

# Monthly date index, December 1993 through May 2005.
Date <- seq(as.Date("1993-12-01"), as.Date("2005-05-01"), by = "1 month")

set.seed(123)
df <- data.frame(
  date = Date,
  value = rnorm(n = length(Date))
)

# Add 10% missing data. The count is rounded down explicitly instead of
# passing a fractional size to sample().
n_missing <- floor(nrow(df) / 10)
df$value[sample(x = seq_len(nrow(df)), size = n_missing)] <- NA

# Fill each missing value with the mean of all observed values that fall in
# the same calendar month. The result is stored so it can be reused, and is
# ungrouped so later operations are not silently applied per month.
df_complete <- df %>%
  # group by month:
  group_by(month = format(date, "%m")) %>%
  # replace with mean, if missing:
  mutate(
    value_complete = ifelse(is.na(value), mean(value, na.rm = TRUE), value)
  ) %>%
  ungroup()

df_complete
答案 2 :(得分:0)
我稍微调整了一下您的初始数据,以便能够使用两列创建数据框:
# Monthly date index matching the length of the sample values (55 months).
Date <- seq(as.Date("1993-12-01"), as.Date("1998-06-01"), by = "1 month")
value <- c(
  2.20000, 0.70000, 2.80000, 0.80000, 1.20000, 1.50000, 0.90000, NA,
  0.80000, 1.70000, 2.40000,
  1.60000, 4.00000, 4.40000, 2.50000, 1.10000, 2.40000, 1.10000, 3.70000,
  6.40000, 7.00000, 2.90000,
  3.00000, 3.60000, 4.90000, NA, 4.90000, 0.70000, 0.30000, 0.70000,
  0.37000, 0.20000, 0.05000,
  0.35000, 0.05000, 2.50000, 0.08000, 0.12000, 2.53000, 2.49000, 2.04000,
  2.92000, 0.27000, 0.33000,
  0.10000, 0.36000, 2.32000, 0.19000, 0.23000, 0.18000, 0.28000, 0.19500,
  0.07300, 0.65000, 0.30000
)
mydf <- data.frame(Date, value)

# Get the month and add it as a column
mydf$month <- months(mydf$Date)

# Calculate the average by month (missing values are ignored); the result
# has the columns `month` and `x`
avg_by_month <- with(
  mydf,
  aggregate(value, by = list(month = month), FUN = mean, na.rm = TRUE)
)

# Merge averaged values to the data frame
mydf2 <- merge(x = mydf, y = avg_by_month, by = "month")

# merge() sorts its result by the key column (the month name), which
# scrambles the time series; restore chronological order.
mydf2 <- mydf2[order(mydf2$Date), ]
rownames(mydf2) <- NULL

# Recuperate the average where needed
is_missing <- is.na(mydf2$value)
mydf2$value[is_missing] <- mydf2$x[is_missing]

# Discard average if not needed anymore
mydf2$x <- NULL
答案 3 :(得分:0)
为了完整起见,这里还提供一个 `data.table` 解决方案,该方案只更新 `NA` 值,并且无需额外添加 `month` 列:
library(data.table)
# Mean of the non-missing values for each calendar month. DT is expected to
# already exist with the columns `value` and `date`. The result carries the
# grouping column `month` and the aggregate in the column `V1`, both of which
# are used in the lookup below.
avg_by_month <- setDT(DT)[, mean(value, na.rm = TRUE), by = month(date)]
# Update only the rows whose value is NA, grouped by month: .BY holds the
# month of the current group and is used to look up that month's mean.
DT[is.na(value), value := avg_by_month[month == .BY, V1], by = month(date)]
这里的“技巧”是:对包含 `NA` 值的行按 `month(date)` 分组,并使用分组变量 `.BY` 查找对应的月平均值。
OP 提供的 `value` 中有两个 `NA` 值。这里沿用了这些数据,并另外将一个值(1994年1月)也设置为 `NA`(共三个缺失值),以获得更好的测试用例:
library(data.table)

# Sample series: the two NAs supplied by the OP plus one additional NA in
# the second position.
value <- c(
  2.2, NA, 2.8, 0.8, 1.2, 1.5, 0.9, NA, 0.8, 1.7, 2.4,
  1.6, 4, 4.4, 2.5, 1.1, 2.4, 1.1, 3.7, 6.4, 7, 2.9,
  3, 3.6, 4.9, NA, 4.9, 0.7, 0.3, 0.7, 0.37, 0.2, 0.05,
  0.35, 0.05, 2.5, 0.08, 0.12, 2.53, 2.49, 2.04, 2.92, 0.27, 0.33,
  0.1, 0.36, 2.32, 0.19, 0.23, 0.18, 0.28, 0.195, 0.073, 0.65, 0.3
)

# One row per month, starting in December 1993, as many months as values.
DT <- data.table(
  value,
  date = seq(
    from = as.Date("1993-12-01"),
    by = "1 month",
    length.out = length(value)
  )
)

# Row numbers of the missing observations
na_idx <- which(is.na(DT$value))

# Show the rows that are about to be filled
DT[na_idx]
   value       date
1:    NA 1994-01-01
2:    NA 1994-07-01
3:    NA 1996-01-01
# Mean of the non-missing values for each calendar month; the result carries
# the grouping column `month` and the aggregate in the column `V1`.
avg_by_month <- setDT(DT)[, mean(value, na.rm = TRUE), by = month(date)]
# Update only the rows whose value is NA, grouped by month: .BY holds the
# month of the current group and is used to look up that month's mean.
DT[is.na(value), value := avg_by_month[month == .BY, V1], by = month(date)]
# Show the previously missing rows again to inspect the filled-in values.
DT[na_idx]
      value       date
1: 1.566667 1994-01-01
2: 2.310000 1994-07-01
3: 1.566667 1996-01-01