在 R 中按组替换缺失值

时间:2018-05-25 15:25:17

标签: r dataframe dplyr data.table tidyr

如何在每个组内分别用该组的均值替换缺失值?

可重复的例子:

# Reproducible example: 22 rows, two integer grouping columns (group1, group.2)
# and two integer measurement columns (x1, x2) that contain missing values.
mydata <- data.frame(
  group1 = rep(1:2, times = c(8L, 14L)),
  group.2 = rep(1:2, times = 11L),
  x1 = c(
    20L, 4L, 91L, NA, 94L, 69L, 38L, NA, 29L, 69L, 55L,
    86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA, 49L, 90L
  ),
  x2 = c(
    44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L, NA,
    54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L
  )
)

目前我已经找到了在不分组的情况下、用整列均值替换缺失值的方法:

library(dplyr)
# Replace each missing value in the numeric columns whose names start with
# "x1" by that column's overall (ungrouped) mean.
# - `where(is.numeric)` restricts the column selection itself, so non-numeric
#   columns are never passed through `ifelse()`, which would strip their
#   attributes.
# - `across()` with a lambda replaces the deprecated `funs()` helper.
mydata %>%
  mutate(across(
    starts_with("x1") & where(is.numeric),
    ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
  ))

但我需要在 group1 与 group.2 的每一种组合(即每个组)内分别进行替换。

编辑:下面是一个较小的数据集

# Smaller example: 12 rows, three rows per (group1, group.2) combination,
# with missing values in both x1 and x2.
data.frame(
  group1 = rep(1:2, each = 6L),
  group.2 = rep(rep(1:2, each = 3L), times = 2L),
  x1 = c(63L, 67L, 57L, NA, 65L, 75L, 57L, 80L, 42L, NA, 35L, 80L),
  x2 = c(46L, 1L, NA, 41L, 80L, NA, 74L, 73L, NA, 13L, 83L, NA)
)

1 个答案:

答案 0 :(得分:1)

# Same 22-row example as in the question, but with the second grouping column
# named `group2` (no dot).
mydata <- data.frame(
  group1 = rep(1:2, times = c(8L, 14L)),
  group2 = rep(1:2, times = 11L),
  x1 = c(
    20L, 4L, 91L, NA, 94L, 69L, 38L, NA, 29L, 69L, 55L,
    86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA, 49L, 90L
  ),
  x2 = c(
    44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L, NA,
    54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L
  )
)


library(tidyverse)

# Replace every NA in x1 and x2 by the mean of the same column within the
# same (group1, group2) combination.
#
# Grouping directly on the two key columns avoids the unite -> gather ->
# spread -> separate round trip: row order is untouched, no helper id is
# needed, and group1 / group2 keep their original integer type instead of
# coming back as character.
mydata %>%
  group_by(group1, group2) %>%            # one group per key combination
  mutate(across(
    c(x1, x2),
    ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)   # NA -> group mean
  )) %>%
  ungroup()                               # drop the grouping afterwards

# The result looks like this (exact tibble formatting depends on the
# installed package versions):
#
# # A tibble: 22 x 4
#    group1 group2    x1    x2
#     <int>  <int> <dbl> <dbl>
#  1      1      1  20    44  
#  2      1      2   4    94  
#  3      1      1  91    61.3
#  4      1      2  36.5   1  
#  5      1      1  94    67  
#  6      1      2  69    39  
#  7      1      1  38    73  
#  8      1      2  36.5  22  
#  9      2      1  29    44  
# 10      2      2  69    24  
# # ... with 12 more rows