Question

我在这个问题上卡住了。我需要针对 train 数据集的各列，创建一个包含 NA 计数和百分比的表（或数据框）。我正在尝试使用下面这个循环：

# Empty data frame with three named columns (zero rows); the loop below
# appends one row per column of `train`.
df <- setNames(data.frame(matrix(ncol = 3, nrow = 0)), c("Variable_name", "%NA is_pass=1", "%NA is_pass=0"))

# For each column name `i`: percentage of NA values among rows with
# is_pass == 1 and among rows with is_pass == 0, rounded to 2 decimals.
for (i in names(train)){
    # c() mixes a string with two numbers, so `v` ends up as a character vector.
    v <- c(
        i,
        # NOTE(review): `$i` looks up a column literally named "i", not the
        # column whose name is stored in `i`; indexing by a name held in a
        # variable is normally written `[[i]]`.
        round(sum(train[,c(i,"is_pass")]$is_pass == 1 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 1)*100,2),
        round(sum(train[,c(i,"is_pass")]$is_pass == 0 & is.na(train[,c(i,"is_pass")]$i))/sum(train[,c(i,"is_pass")]$is_pass == 0)*100,2)
    )
    # Grows `df` by one row per iteration.
    # NOTE(review): presumably the "invalid factor level" warnings come from
    # this rbind() writing new strings into factor columns - confirm.
    df <- rbind(df,v)
}

但是，我仍然遇到如下警告（所有变量都会出现；为简化起见，这里只列出前两个变量的）：

Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_id"):
“invalid factor level, NA generated”
Warning message in `[<-.factor`(`*tmp*`, ri, value = "program_type"):
“invalid factor level, NA generated”

查看数据：

> head(train)
id        program_id program_type program_duration test_id test_type
1 9389_150  Y_1        Y            136              150     offline  
2 16523_44  T_1        T            131               44     offline  
3 13987_178 Z_2        Z            120              178     online   
4 13158_32  T_2        T            117               32     offline  
5 10591_84  V_3        V            131               84     offline  
6 12531_23  T_3        T            134               23     offline  

  difficulty_level trainee_id gender education           city_tier age
1 intermediate      9389      M      Matriculation       3         24 
2 easy             16523      F      High School Diploma 4         26 
3 easy             13987      M      Matriculation       1         40 
4 easy             13158      F      Matriculation       3         NA 
5 intermediate     10591      F      High School Diploma 1         42 
6 intermediate     12531      F      High School Diploma 1         29 

  total_programs_enrolled is_handicapped trainee_engagement_rating is_pass
1 5                       N              1                         0      
2 2                       N              3                         1      
3 1                       N              2                         1      
4 4                       N              1                         1      
5 2                       N              4                         1      
6 4                       N              2                         0

样本数据：

# Sample of the `train` data frame: 6 rows, 15 columns (factor and integer
# columns), with one NA in `age`. Looks like dput() output; the expression is
# not assigned to a name here.
structure(list(program_id = structure(c(16L, 3L, 21L, 4L, 11L, 
5L), .Label = c("S_1", "S_2", "T_1", "T_2", "T_3", "T_4", "U_1", 
"U_2", "V_1", "V_2", "V_3", "V_4", "X_1", "X_2", "X_3", "Y_1", 
"Y_2", "Y_3", "Y_4", "Z_1", "Z_2", "Z_3"), class = "factor"), 
    program_type = structure(c(6L, 2L, 7L, 2L, 4L, 2L), .Label = c("S", 
    "T", "U", "V", "X", "Y", "Z"), class = "factor"), program_duration = c(136L, 
    131L, 120L, 117L, 131L, 134L), test_id = c(150L, 44L, 178L, 
    32L, 84L, 23L), test_type = structure(c(1L, 1L, 2L, 1L, 1L, 
    1L), .Label = c("offline", "online"), class = "factor"), 
    difficulty_level = structure(c(3L, 1L, 1L, 1L, 3L, 3L), .Label = c("easy", 
    "hard", "intermediate", "vary hard"), class = "factor"), 
    trainee_id = c(9389L, 16523L, 13987L, 13158L, 10591L, 12531L
    ), gender = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("F", 
    "M"), class = "factor"), education = structure(c(4L, 2L, 
    4L, 4L, 2L, 2L), .Label = c("Bachelors", "High School Diploma", 
    "Masters", "Matriculation", "No Qualification"), class = "factor"), 
    city_tier = c(3L, 4L, 1L, 3L, 1L, 1L), age = c(24L, 26L, 
    40L, NA, 42L, 29L), total_programs_enrolled = c(5L, 2L, 1L, 
    4L, 2L, 4L), is_handicapped = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("N", "Y"), class = "factor"), trainee_engagement_rating = c(1L, 
    3L, 2L, 1L, 4L, 2L), is_pass = c(0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("program_id", 
"program_type", "program_duration", "test_id", "test_type", "difficulty_level", 
"trainee_id", "gender", "education", "city_tier", "age", "total_programs_enrolled", 
"is_handicapped", "trainee_engagement_rating", "is_pass"), row.names = c(NA, 
6L), class = "data.frame")

Answer 1

这是一种方法，不过我不确定其中的计算是否完全正确。

# Share of NA values in every column of `train`, split by outcome.
# Result: a numeric matrix with one column per variable and two rows:
#   ones   - NA share among rows where is_pass is 1
#   zeroes - NA share among rows where is_pass is 0
# A group with no rows gives NaN (0 / 0).
# vapply() with a named template always returns this 2-row numeric matrix,
# whereas the shape of an sapply() result depends on its input.
vapply(names(train), function(nm) {
  # %in% never returns NA, so rows with a missing is_pass belong to neither
  # group instead of turning every sum below into NA.
  ones <- train$is_pass %in% 1
  zeroes <- train$is_pass %in% 0
  nas <- is.na(train[[nm]])
  c(ones = sum(nas & ones) / sum(ones),
    zeroes = sum(nas & zeroes) / sum(zeroes))
}, FUN.VALUE = c(ones = 0, zeroes = 0))
#        program_id program_type program_duration test_id test_type difficulty_level trainee_id
# ones            0            0                0       0         0                0          0
# zeroes          0            0                0       0         0                0          0
#        gender education city_tier  age total_programs_enrolled is_handicapped
# ones        0         0         0 0.25                       0              0
# zeroes      0         0         0 0.00                       0              0
#        trainee_engagement_rating is_pass
# ones                           0       0
# zeroes                         0       0

如果您希望使用data.frame而不是上面的matrix（带有行/列名称）的输出，请尝试：

# Same NA shares as a data frame: one row per column of `train`, with
#   nm     - the column name
#   ones   - NA share among rows where is_pass is 1
#   zeroes - NA share among rows where is_pass is 0
# A group with no rows gives NaN (0 / 0).
# lapply() builds one single-row data frame per column; do.call(rbind, ...)
# stacks them in a single call.
do.call(rbind, lapply(names(train), function(nm) {
  # %in% never returns NA, so rows with a missing is_pass belong to neither
  # group instead of turning every sum below into NA.
  ones <- train$is_pass %in% 1
  zeroes <- train$is_pass %in% 0
  nas <- is.na(train[[nm]])
  data.frame(nm = nm,
             ones = sum(nas & ones) / sum(ones),
             zeroes = sum(nas & zeroes) / sum(zeroes))
}))
#                           nm ones zeroes
# 1                 program_id 0.00      0
# 2               program_type 0.00      0
# 3           program_duration 0.00      0
# 4                    test_id 0.00      0
# 5                  test_type 0.00      0
# 6           difficulty_level 0.00      0
# 7                 trainee_id 0.00      0
# 8                     gender 0.00      0
# 9                  education 0.00      0
# 10                 city_tier 0.00      0
# 11                       age 0.25      0
# 12   total_programs_enrolled 0.00      0
# 13            is_handicapped 0.00      0
# 14 trainee_engagement_rating 0.00      0
# 15                   is_pass 0.00      0

数据：

# Sample data used by the snippets in this answer: `train` has 6 rows and
# 15 columns (factor and integer columns), with one NA in `age`.
train <- structure(list(program_id = structure(c(16L, 3L, 21L, 4L, 11L, 
5L), .Label = c("S_1", "S_2", "T_1", "T_2", "T_3", "T_4", "U_1", 
"U_2", "V_1", "V_2", "V_3", "V_4", "X_1", "X_2", "X_3", "Y_1", 
"Y_2", "Y_3", "Y_4", "Z_1", "Z_2", "Z_3"), class = "factor"), 
    program_type = structure(c(6L, 2L, 7L, 2L, 4L, 2L), .Label = c("S", 
    "T", "U", "V", "X", "Y", "Z"), class = "factor"), program_duration = c(136L, 
    131L, 120L, 117L, 131L, 134L), test_id = c(150L, 44L, 178L, 
    32L, 84L, 23L), test_type = structure(c(1L, 1L, 2L, 1L, 1L, 
    1L), .Label = c("offline", "online"), class = "factor"), 
    difficulty_level = structure(c(3L, 1L, 1L, 1L, 3L, 3L), .Label = c("easy", 
    "hard", "intermediate", "vary hard"), class = "factor"), 
    trainee_id = c(9389L, 16523L, 13987L, 13158L, 10591L, 12531L
    ), gender = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("F", 
    "M"), class = "factor"), education = structure(c(4L, 2L, 
    4L, 4L, 2L, 2L), .Label = c("Bachelors", "High School Diploma", 
    "Masters", "Matriculation", "No Qualification"), class = "factor"), 
    city_tier = c(3L, 4L, 1L, 3L, 1L, 1L), age = c(24L, 26L, 
    40L, NA, 42L, 29L), total_programs_enrolled = c(5L, 2L, 1L, 
    4L, 2L, 4L), is_handicapped = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("N", "Y"), class = "factor"), trainee_engagement_rating = c(1L, 
    3L, 2L, 1L, 4L, 2L), is_pass = c(0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("program_id", 
"program_type", "program_duration", "test_id", "test_type", "difficulty_level", 
"trainee_id", "gender", "education", "city_tier", "age", "total_programs_enrolled", 
"is_handicapped", "trainee_engagement_rating", "is_pass"), row.names = c(NA, 
6L), class = "data.frame")

Answer 2

我们可以修复此循环，但这实际上是不必要且缓慢的方法。以下是一些替代方法：

# count NA per column
# is.na() on a data frame returns a logical matrix with the same column
# names, so its column sums are the per-column NA counts. This skips apply(),
# which first converts the whole data frame to a single-type matrix.
colSums(is.na(train))

# fraction NA per column, between 0 and 1 (multiply by 100 for a percentage)
colMeans(is.na(train))

# as a data.frame: one row, two summary columns per variable
#   <column>_n_na    - number of NA values in that column
#   <column>_prop_na - fraction of NA values in that column (0 to 1)
# Each function receives the current column as `x`. Referring to `train`
# inside the summary functions would repeat whole-table totals for every
# column instead of per-column values.
# across() needs dplyr >= 1.0; it replaces the deprecated
# summarise_all() / funs() pair.
train %>%
  summarise(across(everything(),
                   list(n_na = function(x) sum(is.na(x)),
                        prop_na = function(x) mean(is.na(x)))))

这里有一份不错的汇总，收录了使用各种方法实现的其他示例。

在循环中向数据帧添加列表/向量

2 个答案: