我试图用var2和var3的平均值来填充var1中的空白,但是我无法使它正常工作。到目前为止,这是我尝试过的:
# Example data. Mixing numbers with "" inside c() coerces every element to
# text, so var1 is not a numeric column and its gaps are empty strings "",
# not NA.
df <- data.frame(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3))
df$var2 <- c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6)
df$var3 <- c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)
# Attempt to fill each gap in var1 with the mean of var2 and var3, row by row.
for(i in 1:length(df$var1)) {
# NOTE(review): three things keep this line from filling anything:
#   * is.na("") is FALSE, so the "yes" branch is never evaluated;
#   * mean(a, b) passes b as the `trim` argument instead of averaging a and b;
#   * the "no" branch is a comparison (==), not an assignment.
ifelse(is.na(df$var1[i]), df$var1[i] <- mean(df$var2[i], df$var3[i]), df$var1[i] == df$var1[i])
}
我不确定自己在做什么错。运行代码后,var1仍显示空白单元格。
非常感谢您的帮助
答案 0 :(得分:0)
尝试一下:
# Example data: var1 is read as text because its gaps are empty strings.
df <- data.frame(
  var1 = c(1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12, 2, "", 3),
  var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
  var3 = c(4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22),
  stringsAsFactors = FALSE
)

# Mark the gaps with the text "NA" so they can be tested with `==`.
df[df == ""] <- "NA"

# Collect the result in a numeric vector: writing the means back into the
# character column would silently turn them into text.
var1_filled <- as.numeric(df$var1)  # the "NA" marker parses as a real NA

# Fill each marked row with the mean of var2 and var3 for that row.
for (i in seq_along(var1_filled)) {
  if (df$var1[i] == "NA") {
    var1_filled[i] <- rowMeans(df[i, c("var2", "var3")])
  }
}

# var1 is now a proper numeric column with no gaps.
df$var1 <- var1_filled
或者:
# Vectorized form: ifelse() already works on whole columns, so no loop is
# needed. Rows whose var1 holds the "NA" marker get the mean of columns 2:3;
# all other rows keep their own value, converted to numeric so the column
# ends up numeric rather than text.
df[, 1] <- ifelse(df[, 1] == "NA", rowMeans(df[, 2:3]), as.numeric(df[, 1]))
或者,也可以不把空字符串替换成文本 "NA"(如上例所示),而是让它们保持为空,并跳过 df[df==""] <- "NA" 这一步:
# Vectorized form: ifelse() already works on whole columns, so no loop is
# needed. Rows whose var1 is the empty string get the mean of columns 2:3;
# all other rows keep their own value, converted to numeric so the column
# ends up numeric rather than text.
df[, 1] <- ifelse(df[, 1] == "", rowMeans(df[, 2:3]), as.numeric(df[, 1]))
或将空白标识为“真实” NA:
# Turn the empty strings into real missing values so is.na() can find them.
df[df == ""] <- NA

# Vectorized form: rows where var1 is missing get the mean of columns 2:3;
# all other rows keep their own value, converted to numeric so the column
# ends up numeric rather than text.
df[, 1] <- ifelse(is.na(df[, 1]), rowMeans(df[, 2:3]), as.numeric(df[, 1]))
答案 1 :(得分:0)
没有任何循环的另一种方式:
library(dplyr)
# Convert var1 to numeric (empty strings become NA), then fill each NA with
# the mean of var2 and var3 from the same row. coalesce() keeps the first
# non-missing value, so existing var1 values are left untouched.
df %>%
  mutate(
    var1 = as.numeric(var1),
    var1 = coalesce(var1, (var2 + var3) / 2)
  )
#> var1 var2 var3
#> 1 1.0 1 4
#> 2 2.0 8 1
#> 3 5.0 9 1
#> 4 3.0 1 4
#> 5 3.0 1 4
#> 6 5.5 5 6
#> 7 7.5 8 7
#> 8 2.0 8 8
#> 9 2.0 3 9
#> 10 6.0 2 10
#> 11 7.0 0 11
#> 12 3.0 9 12
#> 13 8.5 4 13
#> 14 9.0 4 14
#> 15 11.0 7 15
#> 16 3.0 3 16
#> 17 3.0 5 17
#> 18 11.0 5 18
#> 19 12.0 2 19
#> 20 2.0 4 20
#> 21 13.5 6 21
#> 22 3.0 6 22
答案 2 :(得分:0)
我将在这里使用data.table方法。它应该可以处理较大的数据,并且可以避免在不需要的地方循环数据。
library(data.table)
# Example data: var1 is read as text because its gaps are empty strings.
dt <- data.table(
  var1 = c(1, 2, "", 3, 3, "", "", 2, 2, 6, 7, 3, "", "", "", 3, 3, 11, 12, 2, "", 3),
  var2 = c(1, 8, 9, 1, 1, 5, 8, 8, 3, 2, 0, 9, 4, 4, 7, 3, 5, 5, 2, 4, 6, 6),
  var3 = c(4, 1, 1, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)
)

# Convert var1 to numeric in place; the empty strings become NA.
dt[, var1 := as.numeric(var1)]

# Only for the rows where var1 is missing, assign the row mean of the columns
# named in .SDcols. rowMeans() is vectorized, so no per-row apply() is needed.
dt[is.na(var1), var1 := rowMeans(.SD), .SDcols = c("var2", "var3")]

# Print the filled table.
dt
var1 var2 var3
1: 1.0 1 4
2: 2.0 8 1
3: 5.0 9 1
4: 3.0 1 4
5: 3.0 1 4
6: 5.5 5 6
7: 7.5 8 7
8: 2.0 8 8
9: 2.0 3 9
10: 6.0 2 10
11: 7.0 0 11
12: 3.0 9 12
13: 8.5 4 13
14: 9.0 4 14
15: 11.0 7 15
16: 3.0 3 16
17: 3.0 5 17
18: 11.0 5 18
19: 12.0 2 19
20: 2.0 4 20
21: 13.5 6 21
22: 3.0 6 22