R:如何用另外两列的平均值填充某一列的空白?

时间:2018-10-11 19:28:05

标签: r

我试图用var2和var3的平均值来填充var1中的空白,但是我无法使它正常工作。到目前为止,这是我尝试过的:

# Build the example data. var1 mixes numbers with "" inside a single c() call,
# so the whole vector is coerced to character (on R < 4.0, where
# stringsAsFactors defaulted to TRUE, data.frame() would turn it into a factor).
# The blanks are therefore empty strings, not NA.
df <- data.frame(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3))
df$var2 <- c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6)
df$var3 <- c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)


 # Attempt to fill the blanks of var1 row by row.
 # NOTE(review): this loop leaves var1 unchanged, for these reasons:
 #   - the blanks are "" rather than NA, so is.na() is FALSE for every row and
 #     the assignment in the second ifelse() argument is never evaluated;
 #   - mean(a, b) passes b as the `trim` argument, so even when evaluated it
 #     would not average the two values (mean(c(a, b)) would);
 #   - the third argument uses `==`, which only compares and assigns nothing.
 for(i in 1:length(df$var1)) {
   ifelse(is.na(df$var1[i]), df$var1[i] <- mean(df$var2[i], df$var3[i]), df$var1[i] == df$var1[i])
 }

我不确定自己哪里做错了。运行代码后,var1 中仍然有空白单元格。

非常感谢您的帮助

3 个答案:

答案 0 :(得分:0)

尝试一下:

# Example data. var1 mixes numbers with "" (blank), so it is stored as
# character; stringsAsFactors = FALSE keeps it from becoming a factor.
df <- data.frame(
  var1 = c(
    1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12, 2, "", 3
  ),
  var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
  var3 = c(
    4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
    22
  ),
  stringsAsFactors = FALSE
)
# Tag the blanks with the text placeholder "NA" (a string, not a real NA).
df[df == ""] <- "NA"

# Fill every tagged row of var1 with the mean of var2 and var3 in a single
# vectorized step. The result is assembled as a numeric vector so that var1
# ends up numeric; writing the means cell by cell into the character column
# would store them as text.
is_blank <- df$var1 %in% "NA" # %in% never returns NA, unlike ==
filled <- numeric(nrow(df))
filled[!is_blank] <- as.numeric(df$var1[!is_blank])
filled[is_blank] <- rowMeans(df[is_blank, c("var2", "var3")])
df$var1 <- filled

或者:

# Vectorized version: rows whose first column still holds the "NA" placeholder
# text get the mean of columns 2 and 3; all other rows keep their own value.
# The column is rebuilt as numeric so the means are not stored as text.
is_blank <- df[, 1] %in% "NA" # %in% never returns NA, unlike ==
filled <- numeric(nrow(df))
# as.character() keeps the conversion correct if the column is a factor.
filled[!is_blank] <- as.numeric(as.character(df[!is_blank, 1]))
filled[is_blank] <- rowMeans(df[is_blank, 2:3])
df[, 1] <- filled

或者,也可以不把空白重新定义为文本 "NA"(如上例所示),而是保留空白,并跳过 df[df==""] <- "NA" 这一步:

# Vectorized version that works directly on the blank strings: rows whose
# first column is "" get the mean of columns 2 and 3; all other rows keep
# their own value. The column is rebuilt as numeric so the means are not
# stored as text.
is_blank <- df[, 1] %in% "" # %in% never returns NA, unlike ==
filled <- numeric(nrow(df))
# as.character() keeps the conversion correct if the column is a factor.
filled[!is_blank] <- as.numeric(as.character(df[!is_blank, 1]))
filled[is_blank] <- rowMeans(df[is_blank, 2:3])
df[, 1] <- filled

或者将空白标记为真正的 NA:

# Turn the blank strings into real missing values.
df[df == ""] <- NA

# Vectorized fill: convert the first column to numeric, then replace its
# missing entries with the mean of columns 2 and 3 from the same row.
is_missing <- is.na(df[, 1])
# as.character() keeps the conversion correct if the column is a factor.
filled <- as.numeric(as.character(df[, 1]))
filled[is_missing] <- rowMeans(df[is_missing, 2:3])
df[, 1] <- filled

答案 1 :(得分:0)

没有任何循环的另一种方式:

library(dplyr)

# Coerce the three columns to numeric (blank strings in var1 become NA), then
# replace each missing var1 with the mean of var2 and var3 from the same row.
df %>%
  mutate_at(vars(var1:var3), as.numeric) %>%
  mutate(var1 = if_else(is.na(var1), (var2 + var3) / 2, var1))
#>    var1 var2 var3
#> 1   1.0    1    4
#> 2   2.0    8    1
#> 3   5.0    9    1
#> 4   3.0    1    4
#> 5   3.0    1    4
#> 6   5.5    5    6
#> 7   7.5    8    7
#> 8   2.0    8    8
#> 9   2.0    3    9
#> 10  6.0    2   10
#> 11  7.0    0   11
#> 12  3.0    9   12
#> 13  8.5    4   13
#> 14  9.0    4   14
#> 15 11.0    7   15
#> 16  3.0    3   16
#> 17  3.0    5   17
#> 18 11.0    5   18
#> 19 12.0    2   19
#> 20  2.0    4   20
#> 21 13.5    6   21
#> 22  3.0    6   22

答案 2 :(得分:0)

我在这里使用 data.table 的方法。它应该能够处理较大的数据,并且可以避免不必要的循环。

library(data.table)
# Example table. var1 mixes numbers with "" (blank), so it is stored as
# character.
dt <- data.table(
  var1 = c(
    1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12, 2, "", 3
  ),
  var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
  var3 = c(
    4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
    22
  )
)
# Convert var1 to numeric by reference; the blank strings become NA.
dt[, var1 := as.numeric(var1)]
# Fill the missing var1 values by reference with the row-wise mean of var2 and
# var3. rowMeans() computes all rows at once instead of calling mean() once
# per row through apply().
dt[is.na(var1), var1 := rowMeans(.SD), .SDcols = c("var2", "var3")]

dt


    var1 var2 var3
 1:  1.0    1    4
 2:  2.0    8    1
 3:  5.0    9    1
 4:  3.0    1    4
 5:  3.0    1    4
 6:  5.5    5    6
 7:  7.5    8    7
 8:  2.0    8    8
 9:  2.0    3    9
10:  6.0    2   10
11:  7.0    0   11
12:  3.0    9   12
13:  8.5    4   13
14:  9.0    4   14
15: 11.0    7   15
16:  3.0    3   16
17:  3.0    5   17
18: 11.0    5   18
19: 12.0    2   19
20:  2.0    4   20
21: 13.5    6   21
22:  3.0    6   22