我在这个问题上卡住了。我需要针对 train 数据集中的各列,创建一个包含 NA 计数和百分比的表(或数据框)。我正在尝试使用以下循环:
# Empty three-column data frame; the loop below appends one row to it for
# every column of `train`.
df <- setNames(data.frame(matrix(ncol = 3, nrow = 0)), c("Variable_name", "%NA is_pass=1", "%NA is_pass=0"))
for (i in names(train)){
# c() mixes the column name (character) with the two rounded percentages, so
# `v` is a character vector.
v <- c(
i,
# NOTE(review): `$i` looks up a column literally named "i", not the column
# whose name is stored in `i`; `train[[i]]` would use the value of `i`.
round(sum(train[,c(i,"is_pass")]$is_pass == 1 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 1)*100,2),
round(sum(train[,c(i,"is_pass")]$is_pass == 0 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 0)*100,2)
)
# NOTE(review): df is grown with rbind() on every iteration; presumably the
# "invalid factor level" warnings are raised here, when a new character value
# is appended to a column that has become a factor -- confirm.
df <- rbind(df,v)
}
但是,我仍然遇到以下警告(所有变量都会出现;为简化起见,这里只列出前两个变量的警告):
Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_id"):
“invalid factor level, NA generated”
Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_type"):
“invalid factor level, NA generated”
查看数据:
> head(train)
id program_id program_type program_duration test_id test_type
1 9389_150 Y_1 Y 136 150 offline
2 16523_44 T_1 T 131 44 offline
3 13987_178 Z_2 Z 120 178 online
4 13158_32 T_2 T 117 32 offline
5 10591_84 V_3 V 131 84 offline
6 12531_23 T_3 T 134 23 offline
difficulty_level trainee_id gender education city_tier age
1 intermediate 9389 M Matriculation 3 24
2 easy 16523 F High School Diploma 4 26
3 easy 13987 M Matriculation 1 40
4 easy 13158 F Matriculation 3 NA
5 intermediate 10591 F High School Diploma 1 42
6 intermediate 12531 F High School Diploma 1 29
total_programs_enrolled is_handicapped trainee_engagement_rating is_pass
1 5 N 1 0
2 2 N 3 1
3 1 N 2 1
4 4 N 1 1
5 2 N 4 1
6 4 N 2 0
样本数据:
structure(list(program_id = structure(c(16L, 3L, 21L, 4L, 11L,
5L), .Label = c("S_1", "S_2", "T_1", "T_2", "T_3", "T_4", "U_1",
"U_2", "V_1", "V_2", "V_3", "V_4", "X_1", "X_2", "X_3", "Y_1",
"Y_2", "Y_3", "Y_4", "Z_1", "Z_2", "Z_3"), class = "factor"),
program_type = structure(c(6L, 2L, 7L, 2L, 4L, 2L), .Label = c("S",
"T", "U", "V", "X", "Y", "Z"), class = "factor"), program_duration = c(136L,
131L, 120L, 117L, 131L, 134L), test_id = c(150L, 44L, 178L,
32L, 84L, 23L), test_type = structure(c(1L, 1L, 2L, 1L, 1L,
1L), .Label = c("offline", "online"), class = "factor"),
difficulty_level = structure(c(3L, 1L, 1L, 1L, 3L, 3L), .Label = c("easy",
"hard", "intermediate", "vary hard"), class = "factor"),
trainee_id = c(9389L, 16523L, 13987L, 13158L, 10591L, 12531L
), gender = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("F",
"M"), class = "factor"), education = structure(c(4L, 2L,
4L, 4L, 2L, 2L), .Label = c("Bachelors", "High School Diploma",
"Masters", "Matriculation", "No Qualification"), class = "factor"),
city_tier = c(3L, 4L, 1L, 3L, 1L, 1L), age = c(24L, 26L,
40L, NA, 42L, 29L), total_programs_enrolled = c(5L, 2L, 1L,
4L, 2L, 4L), is_handicapped = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("N", "Y"), class = "factor"), trainee_engagement_rating = c(1L,
3L, 2L, 1L, 4L, 2L), is_pass = c(0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("program_id",
"program_type", "program_duration", "test_id", "test_type", "difficulty_level",
"trainee_id", "gender", "education", "city_tier", "age", "total_programs_enrolled",
"is_handicapped", "trainee_engagement_rating", "is_pass"), row.names = c(NA,
6L), class = "data.frame")
答案 0 :(得分:0)
这是一种方法,不过我不确定其中的计算是否完全符合您的要求。
# Share of missing values per column of `train`, split by outcome.
# Result: a numeric matrix with one column per variable and two rows,
#   ones   - share of NA among rows with is_pass == 1
#   zeroes - share of NA among rows with is_pass == 0
# The outcome masks do not depend on the column, so they are computed once.
ones <- (train$is_pass == 1)
zeroes <- (train$is_pass == 0)
# vapply() guarantees a length-2 numeric result for every column, so the
# output is always a 2-row matrix (sapply() may silently change shape).
vapply(names(train), function(nm) {
  nas <- is.na(train[[nm]])
  c(ones = sum(nas & ones) / sum(ones),
    zeroes = sum(nas & zeroes) / sum(zeroes))
}, numeric(2))
# program_id program_type program_duration test_id test_type difficulty_level trainee_id
# ones 0 0 0 0 0 0 0
# zeroes 0 0 0 0 0 0 0
# gender education city_tier age total_programs_enrolled is_handicapped
# ones 0 0 0 0.25 0 0
# zeroes 0 0 0 0.00 0 0
# trainee_engagement_rating is_pass
# ones 0 0
# zeroes 0 0
如果您希望输出为 data.frame,而不是上面(带有行/列名称)的 matrix,请尝试:
# Same statistics as a data.frame: one row per column of `train`, with the
# column name in `nm` and the NA share among passed (`ones`) and failed
# (`zeroes`) rows.
do.call(
  rbind,
  lapply(names(train), function(col_name) {
    is_missing <- is.na(train[[col_name]])
    passed <- train$is_pass == 1
    failed <- train$is_pass == 0
    data.frame(
      nm = col_name,
      ones = sum(is_missing & passed) / sum(passed),
      zeroes = sum(is_missing & failed) / sum(failed)
    )
  })
)
# nm ones zeroes
# 1 program_id 0.00 0
# 2 program_type 0.00 0
# 3 program_duration 0.00 0
# 4 test_id 0.00 0
# 5 test_type 0.00 0
# 6 difficulty_level 0.00 0
# 7 trainee_id 0.00 0
# 8 gender 0.00 0
# 9 education 0.00 0
# 10 city_tier 0.00 0
# 11 age 0.25 0
# 12 total_programs_enrolled 0.00 0
# 13 is_handicapped 0.00 0
# 14 trainee_engagement_rating 0.00 0
# 15 is_pass 0.00 0
数据:
# Six-row sample of `train` with 15 columns (program_id ... is_pass).
# The only missing value is `age` in row 4, a row with is_pass == 1.
train <- structure(list(program_id = structure(c(16L, 3L, 21L, 4L, 11L,
5L), .Label = c("S_1", "S_2", "T_1", "T_2", "T_3", "T_4", "U_1",
"U_2", "V_1", "V_2", "V_3", "V_4", "X_1", "X_2", "X_3", "Y_1",
"Y_2", "Y_3", "Y_4", "Z_1", "Z_2", "Z_3"), class = "factor"),
program_type = structure(c(6L, 2L, 7L, 2L, 4L, 2L), .Label = c("S",
"T", "U", "V", "X", "Y", "Z"), class = "factor"), program_duration = c(136L,
131L, 120L, 117L, 131L, 134L), test_id = c(150L, 44L, 178L,
32L, 84L, 23L), test_type = structure(c(1L, 1L, 2L, 1L, 1L,
1L), .Label = c("offline", "online"), class = "factor"),
difficulty_level = structure(c(3L, 1L, 1L, 1L, 3L, 3L), .Label = c("easy",
"hard", "intermediate", "vary hard"), class = "factor"),
trainee_id = c(9389L, 16523L, 13987L, 13158L, 10591L, 12531L
), gender = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("F",
"M"), class = "factor"), education = structure(c(4L, 2L,
4L, 4L, 2L, 2L), .Label = c("Bachelors", "High School Diploma",
"Masters", "Matriculation", "No Qualification"), class = "factor"),
city_tier = c(3L, 4L, 1L, 3L, 1L, 1L), age = c(24L, 26L,
40L, NA, 42L, 29L), total_programs_enrolled = c(5L, 2L, 1L,
4L, 2L, 4L), is_handicapped = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("N", "Y"), class = "factor"), trainee_engagement_rating = c(1L,
3L, 2L, 1L, 4L, 2L), is_pass = c(0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("program_id",
"program_type", "program_duration", "test_id", "test_type", "difficulty_level",
"trainee_id", "gender", "education", "city_tier", "age", "total_programs_enrolled",
"is_handicapped", "trainee_engagement_rating", "is_pass"), row.names = c(NA,
6L), class = "data.frame")
答案 1 :(得分:0)
我们可以修复此循环,但这实际上是不必要且缓慢的方法。以下是一些替代方法:
# count NA per column
# is.na() on a data.frame already returns a logical matrix, so the column
# totals can be taken directly instead of pushing the data.frame through
# apply(), which first coerces every column to a common type.
colSums(is.na(train))
# share of NA per column (a fraction between 0 and 1; multiply by 100 for percent)
colMeans(is.na(train))
# as a data.frame: NA count and NA share for every column (dplyr >= 1.0.0)
# Inside the lambdas `.x` is the current column, so each statistic is computed
# per column; the result has columns named <column>_na_count and
# <column>_na_share.
train %>%
  summarise(across(
    everything(),
    list(na_count = ~ sum(is.na(.x)), na_share = ~ mean(is.na(.x)))
  ))
这里有一份不错的汇总(compilation),收集了使用各种方法的其他示例。