替换缺失值

时间:2019-08-31 06:05:05

标签: r dplyr

比方说,我有一个数据框,其中包含某些季度的销售额,而以下几个季度的值均缺失。我想用一个简单的公式替换NA(如下所示的mutate / dplyr)。问题是我不想多次使用mutate。如何同时为所有NA做到这一点?有办法吗?

structure(list(Period = c("1999Q1", "1999Q2", "1999Q3", "1999Q4", 
"2000Q1", "2000Q2", "2000Q3", "2000Q4", "2001Q1", "2001Q2", "2001Q3", 
"2001Q4", "2002Q1", "2002Q2", "2002Q3", "2002Q4", "2003Q1", "2003Q2", 
"2003Q3", "2003Q4"), Sales= c(353.2925571, 425.9299841, 357.5204626, 
363.80247, 302.8081066, 394.328576, 435.15573, 387.99768, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
-20L))

test %>%
      mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
      mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
      mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales))

3 个答案:

答案 0 :(得分:4)

一种dplyrtidyr的可能性是:

df %>%
 group_by(quarter = substr(Period, 5, 6)) %>%
 mutate(Sales_temp = replace_na(Sales, last(na.omit(Sales)))) %>%
 group_by(quarter, na = is.na(Sales)) %>%
 mutate(constant = 1.05,
        Sales_temp = Sales_temp * cumprod(constant),
        Sales = coalesce(Sales, Sales_temp)) %>%
 ungroup() %>%
 select(1:2)

   Period Sales
   <chr>  <dbl>
 1 1999Q1  353.
 2 1999Q2  426.
 3 1999Q3  358.
 4 1999Q4  364.
 5 2000Q1  303.
 6 2000Q2  394.
 7 2000Q3  435.
 8 2000Q4  388.
 9 2001Q1  318.
10 2001Q2  414.
11 2001Q3  457.
12 2001Q4  407.
13 2002Q1  334.
14 2002Q2  435.
15 2002Q3  480.
16 2002Q4  428.
17 2003Q1  351.
18 2003Q2  456.
19 2003Q3  504.
20 2003Q4  449.

或仅使用dplyr

df %>%
 group_by(quarter = substr(Period, 5, 6)) %>%
 mutate(Sales_temp = if_else(is.na(Sales), last(na.omit(Sales)), Sales)) %>%
 group_by(quarter, na = is.na(Sales)) %>%
 mutate(constant = 1.05,
        Sales_temp = Sales_temp * cumprod(constant),
        Sales = coalesce(Sales, Sales_temp)) %>%
 ungroup() %>%
 select(1:2)

答案 1 :(得分:1)

x <- test$Sales

# find that last non-NA data
last.valid <- tail(which(!is.na(x)),1)

# store the "base"
base <- ceiling(last.valid/4)*4 + (-3:0)
base <- base + ifelse(base > last.valid, -4, 0)
base <- x[base]


# calculate the "exponents"
expos <- ceiling( ( seq(length(x)) - last.valid ) / 4 )

test$Sales <- ifelse(is.na(x), bases * 1.05 ^ expos, x)

tail(test)

#    Period    Sales
# 15 2002Q3 479.7592
# 16 2002Q4 427.7674
# 17 2003Q1 350.5382
# 18 2003Q2 456.4846
# 19 2003Q3 503.7472
# 20 2003Q4 449.1558

答案 2 :(得分:0)

这是另一个基本解决方案:

non_nas <- na.omit(test$Sales)
nas <- length(attr(non_nas, 'na.action'))

test$Sales <- c(non_nas, #keep non_nas
                 tail(non_nas, 4) * 1.05 ^(rep(1:floor(nas / 4), each = 4, length.out = nas)))

test