在数据框中填充缺少的值

时间:2018-07-09 16:05:16

标签: r dplyr na missing-data tidyr

嘿,我需要填充数据框中的缺失值。逻辑很简单:如果 M[i, j + 1] 中有值,则使用 M[i, j + 1],否则使用 M[i, j - 1]。但棘手的是,需要填充的范围是从每一行的开头一直到该行最后一个非 NA 值之后的那一列,而不仅仅是与非空单元格相邻的单元格。

这是数据

# Sample input: each row is a key letter followed by seven values. Mixing a
# string with numbers in c() makes every element a character string.
a1 <- c("a", 9, 8, rep(NA, times = 5))
a2 <- c("b", NA, NA, NA, NA, 3, NA, 4)
a3 <- c("c", 11, 6, 7, NA, NA, NA, 6)
M <- rbind(a1, a2, a3)

# TRUE wherever a value column (everything except the key column) is present.
ind <- !is.na(M[, -1])

# First non-NA value of each row, keyed by row number.
# NOTE: this name masks base::t() for the rest of the script.
t <- tapply(
  M[, -1][ind],
  row(M[, -1])[ind],
  head,
  1
)

# Turn the character matrix into a data frame (columns V1..V8), group it by the
# key column V1, and add `last_non_na_col`: for every row in the group, apply()
# finds the highest column position holding a non-NA value, and the outer max()
# takes the largest of those within the group.
# NOTE(review): %>%, group_by(), do() and mutate() come from dplyr, which this
# snippet does not attach itself - presumably loaded earlier; confirm.
M <- M %>%
 as.data.frame(stringsAsFactors = FALSE) %>%
 group_by(V1) %>%
 do(mutate(., last_non_na_col = max(apply(.,1,function(x) max(which(!is.na(x)))))))


 # Fill the missing cells row by row, modifying M in place.
 for (i in 1:nrow(M)) {         
   # Walk from column 3 up to the column just after the row's last non-NA one.
   # Cells are filled left to right, so a later cell can read a value that was
   # filled in an earlier iteration.
  for (j in 3:(M$last_non_na_col[i]+1)) {      
    if (is.na(M[i,j])) { 
      # Prefer the right-hand neighbour; otherwise the left-hand neighbour;
      # otherwise fall back to t[i], the row's first non-NA value.
   M[i,j] = ifelse(!is.na(M[i,j+1]),M[i,j+1],(ifelse(!is.na(M[i,j-1]),M[i,j-1],t[i])))
 } }
  # Column 2 is handled last: if it is NA it takes the value of column 3,
  # which the loop above has already processed.
 for (j in 2) { M[i,j] = ifelse(is.na(M[i,j]), M[i,j+1], M[i,j])}   

}

原始数据就是这样

   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
 a1 "a"  "9"  "8"  NA   NA   NA   NA   NA  
 a2 "b"  NA   NA   NA   NA   "3"  NA   "4" 
 a3 "c"  "11" "6"  "7"  NA   NA   NA   "6" 

我的代码输出如下,这是正确的。请注意,对于单元格 M[3,5],填充值应为 7(它之前的数字),而不是 6(它之后最近的数字)。

 V1    V2    V3    V4    V5    V6    V7    V8    last_non_na_col
 <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>           <int>
1 a     9     8     8     NA    NA    NA    NA                  3
2 b     3     3     3     3     3     4     4                   8
3 c     11    6     7     7     7     6     6                   8

我是用 for 循环实现的。有人能帮我用 tidyverse 来实现吗?

谢谢

凯西

1 个答案:

答案 0 :(得分:1)

有了tbl_df,我们可以使用tidyverse方法

library(tidyverse)
# Reshape to long form (one row per V1/column pair), fill each group's gaps
# from the next observed value, then patch the first NA that is still left
# (the cell right after the last observed value) with that last observed
# value, and reshape back to wide form.
M %>%
  gather(key, val, -V1) %>%
  group_by(V1) %>%
  fill(val, .direction = "up") %>%
  mutate(
    val = replace(
      val,
      # position of the first NA remaining after the upward fill
      which(is.na(val))[1],
      # last non-NA value in the group
      val[tail(which(!is.na(val)), 1)]
    )
  ) %>%
  spread(key, val)
# A tibble: 3 x 8
# Groups:   V1 [3]
#  V1    V2    V3    V4    V5    V6    V7    V8   
#  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a     9     8     8     NA    NA    NA    NA   
#2 b     3     3     3     3     3     4     4    
#3 c     11    6     7     5     5     6     6   

在OP的for循环中,我们可以使用na.locf(用zoo包中的相邻非NA元素填充NA元素)

library(zoo)
# Position of the last non-NA column in each row, derived from M itself before
# any filling happens (for the example data this yields 3, 8, 8). The key
# column V1 is assumed to be non-NA, so every row has at least one hit.
last_non_na_col <- vapply(
  seq_len(nrow(M)),
  function(i) max(which(!is.na(unlist(M[i, ])))),
  integer(1)
)

for (i in seq_len(nrow(M))) {
  # Next-observation-carried-backward over the value columns: every NA that
  # has a later non-NA value in the row takes that value. Only the NAs after
  # the row's last observed value remain.
  filled <- na.locf(unlist(M[i, -1]), fromLast = TRUE, na.rm = FALSE)
  # Assign as a list (one element per column) so the one-row assignment is
  # accepted by tibbles as well as plain data frames.
  M[i, -1] <- as.list(unname(filled))

  # Fill up to the column right after the last observed value, but never past
  # the final column. The sequence is empty when there is nothing to visit, so
  # it cannot count down into the key column.
  upper <- min(ncol(M), last_non_na_col[i] + 1L)
  for (j in seq_len(max(upper - 2L, 0L)) + 2L) {
    if (is.na(M[[i, j]])) {
      # Look at the right-hand neighbour only when it exists.
      has_right <- j < ncol(M) && !is.na(M[[i, j + 1L]])
      M[i, j] <- if (has_right) M[[i, j + 1L]] else M[[i, j - 1L]]
    }
  }
}

M
# A tibble: 3 x 8
# Groups:   V1 [3]
#  V1    V2    V3    V4    V5    V6    V7    V8   
#  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a     9     8     8     NA    NA    NA    NA   
#2 b     3     3     3     3     3     4     4    
#3 c     11    6     7     5     5     6     6    

注意:在这里,我们将last_non_na_col创建为vector,而不是在数据集中创建一个单独的列,以简化索引编制

数据

# Example input for the snippets above, in dput() form: a 3-row table with a
# key column V1 and character value columns V2..V8, carrying the classes
# grouped_df / tbl_df / tbl / data.frame and grouped by V1 (vars = "V1").
# Note that row "c" has "5" in V6.
# NOTE(review): the grouping attributes (vars, indices, labels, ...) look like
# an older dplyr grouped_df layout - confirm that the installed dplyr version
# still accepts them.
M <- structure(list(V1 = c("a", "b", "c"), V2 = c("9", NA, "11"), 
    V3 = c("8", NA, "6"), V4 = c(NA, NA, "7"), V5 = c(NA_character_, 
    NA_character_, NA_character_), V6 = c(NA, "3", "5"), V7 = c(NA_character_, 
    NA_character_, NA_character_), V8 = c(NA, "4", "6")), .Names = c("V1", 
"V2", "V3", "V4", "V5", "V6", "V7", "V8"), row.names = c(NA, 
-3L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
  vars = "V1", drop = TRUE, indices = list(
    0L, 1L, 2L), group_sizes = c(1L, 1L, 1L), biggest_group_size = 1L, 
  labels = structure(list(
    V1 = c("a", "b", "c")), row.names = c(NA, -3L),
  class = "data.frame", vars = "V1", drop = TRUE, .Names = "V1"))