是否有现成的函数可以统计缺失值和完整值的数量

时间:2019-04-16 19:31:37

标签: r dataframe missing-data

问题陈述

现有软件包中是否有一个函数可以相对容易地给出数据帧每一列的缺失元素(即NA)和完整元素的数量?

我能够使用下面提到的方法解决此问题,但是我想知道是否有一个软件包可以为我做到这一点(主要是为了在每次执行数据分析时不重复此代码)。

当前解决方案1

我可以使用以下编写的函数来查找此信息

# Count the missing (NA) elements of an object.
#
# df: the object to inspect; it is passed straight to is.na().
# Returns the total number of elements flagged as NA.
missing.total <- function(df) {
  na_flags <- is.na(df)
  sum(na_flags)
}

# Count the complete (non-NA) elements of an object.
#
# df: the object to inspect; it is passed straight to is.na().
# Returns the total number of elements that are not NA.
complete.total <- function(df) {
  present_flags <- !is.na(df)
  sum(present_flags)
}

# Find number of complete and missing elements for each variable in a data frame.
#
# df: a data frame (or matrix) whose columns are the variables to summarise.
# Returns an integer matrix with one row per column of `df` and two columns,
# "Complete" (non-NA count) and "Missing" (NA count).
var.complete.info <- function(df) {
  # Build the NA mask once. is.na() on a data frame works column by column,
  # so the frame is never coerced to a single-typed matrix as apply() would do.
  na_mask <- is.na(df)

  result <- cbind(
    Complete = colSums(!na_mask),
    Missing = colSums(na_mask)
  )

  # colSums() returns doubles; counts are integers, so store them as such.
  storage.mode(result) <- "integer"
  result
}

我们可以在以下示例中应用此函数:

# Fix the RNG seed so the sampled data frame is reproducible
set.seed(1)

n.col <- 5
n.row <- 100
sample.set <- c(0:9, NA)

# Create sample data frame with missing and complete values: each of the
# n.col columns holds n.row draws, with replacement, from sample.set.
# `replace` is spelled out in full rather than relying on partial matching.
sample.df <- data.frame(
  replicate(n.col, sample(sample.set, n.row, replace = TRUE))
)

> var.complete.info(sample.df)
   Complete Missing
X1       95       5
X2       92       8
X3       89      11
X4       88      12
X5       86      14

当前解决方案2

另一种选择是使用skimr库中的skim()函数,但这会提供过多信息:

library(skimr)

# Using sample.df as defined above in solution 1

# Gives more information than necessary
> skim(sample.df)
Skim summary statistics
 n obs: 100 
 n variables: 5 

── Variable type:integer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 variable missing complete   n mean   sd p0  p25 p50 p75 p100     hist
       X1       5       95 100 4.96 2.79  0 3      5   7    9 ▅▃▃▅▃▂▅▇
       X2       8       92 100 4.75 2.72  0 2.75   5   7    9 ▆▃▅▅▃▆▃▇
       X3      11       89 100 3.57 2.51  0 1      3   5    9 ▇▃▃▃▃▂▂▂
       X4      12       88 100 4.3  2.78  0 2      4   7    9 ▇▃▅▅▅▂▃▆
       X5      14       86 100 4.26 2.9   0 1.25   4   7    9 ▇▂▂▅▅▂▃▅ 

理想的解决方案

我正在寻找一个现有的软件包来为我完成工作,所以我希望找到一个可以使我进行如下编码的软件包:

library(Package_I_am_not_aware_of)

existing.function(data_frame)

2 个答案:

答案 0 :(得分:1)

也许VIM::aggr()适合您。

library(VIM)

# Run aggr() on `d` with plotting switched off, then print its summary.
summary(
  aggr(d, plot = FALSE)
)
# Missings per variable: 
#   Variable Count
# X1     7
# X2     9
# X3    10
# X4    14
# 
# Missings in combinations of variables: 
#   Combinations Count   Percent
# 0:0:0:0     5 16.666667
# 0:0:0:1     3 10.000000
# 0:0:1:0     3 10.000000
# 0:0:1:1     3 10.000000
# 0:1:0:0     4 13.333333
# 0:1:0:1     3 10.000000
# 0:1:1:0     1  3.333333
# 0:1:1:1     1  3.333333
# 1:0:0:0     2  6.666667
# 1:0:0:1     3 10.000000
# 1:0:1:0     1  3.333333
# 1:0:1:1     1  3.333333

如果只需要上面那部分(每个变量的缺失数量),可以直接从结果中提取,这些值保存在一个data.frame中。

# Keep the summary object so that its components can be extracted.
out <- summary(
  aggr(d, plot = FALSE)
)
# The `missings` component holds the per-variable missing counts.
out$missings
#    Variable Count
# X1       X1     7
# X2       X2     9
# X3       X3    10
# X4       X4    14

数据

# Example data for the VIM answer: a data frame with 30 rows and four integer
# columns (X1 to X4), each containing some NA values. The literal looks like
# dput() output, so it is kept verbatim.
d <- structure(list(X1 = c(NA, 2L, 3L, 4L, 5L, 6L, 7L, NA, 9L, 10L, 
11L, 12L, NA, NA, 15L, NA, 17L, NA, 19L, 20L, 21L, 22L, 23L, 
24L, 25L, 26L, 27L, NA, 29L, 30L), X2 = c(31L, 32L, 33L, NA, 
NA, 36L, NA, 38L, 39L, 40L, 41L, NA, 43L, 44L, 45L, 46L, 47L, 
48L, NA, NA, NA, 52L, 53L, 54L, 55L, 56L, NA, 58L, 59L, NA), 
    X3 = c(61L, 62L, 63L, 64L, 65L, 66L, NA, 68L, 69L, 70L, NA, 
    72L, NA, NA, NA, 76L, NA, 78L, NA, 80L, 81L, 82L, 83L, NA, 
    NA, NA, 87L, 88L, 89L, 90L), X4 = c(NA, NA, 93L, 94L, 95L, 
    96L, NA, NA, NA, 100L, NA, NA, 103L, NA, 105L, NA, 107L, 
    108L, 109L, NA, NA, NA, 113L, 114L, NA, NA, 117L, 118L, 119L, 
    120L)), class = "data.frame", row.names = c(NA, -30L))

答案 1 :(得分:1)

使用skimr,我们实际上可以通过skim_with来更改默认的摘要函数:

library(skimr)

# Take the skimmer functions listed under "numeric".
# NOTE(review): presumably these are skimr's default numeric summaries;
# confirm against the installed skimr version.
funs <- get_skimmers()$numeric
# For integer columns, keep only the complete and missing counters.
# append = FALSE presumably replaces the existing integer skimmers rather
# than adding to them - TODO confirm in the skim_with() documentation.
skim_with(integer = list(Complete = funs$complete, Missing = funs$missing), append = FALSE)
# Skim the sample data frame with the customised integer skimmers.
skim(sample.df)

输出:

Skim summary statistics
 n obs: 100 
 n variables: 5 

-- Variable type:integer ----------------------------------------------------------------------------------------------------------
 variable Complete Missing
       X1       95       5
       X2       92       8
       X3       89      11
       X4       88      12
       X5       86      14