建立缺失值计数/百分比表

时间:2018-12-23 02:23:02

标签: r dataframe nan missing-data

df示例:

    a    b    c   d   Y
0   NA   NA   8   3   1
1   NA   2    5   0   1
2   1    0    7   NA  0
3   NA   NA   7   1   0
4   6    NA   2   NA  1

我想为各变量的缺失值建立一个汇总数据框(Y 是二元变量):

# Summarise missing values per column of `df`, overall and within each level
# of the binary outcome `Y`. The result `df_nan` has one row per variable.

# Number of NA values in each column. vapply() walks the columns directly,
# whereas apply() would first coerce the whole data frame to a matrix.
count_na <- function(data) {
  vapply(data, function(x) sum(is.na(x)), integer(1))
}

# Share of NA values in each column (NaN when `data` has no rows).
share_na <- function(data) {
  vapply(data, function(x) sum(is.na(x)) / NROW(x), numeric(1))
}

# which() drops rows where Y itself is NA. Logical indexing with `df$Y == 1`
# would turn each such row into an all-NA row and inflate every NA count.
df_y1 <- df[which(df$Y == 1), ]
df_y0 <- df[which(df$Y == 0), ]

Variable <- colnames(df)
x1 <- share_na(df)     # share of NA over all rows
x2 <- count_na(df)     # NA count over all rows
x3 <- share_na(df_y1)  # share of NA among rows with Y == 1
x4 <- count_na(df_y1)  # NA count among rows with Y == 1
x5 <- share_na(df_y0)  # share of NA among rows with Y == 0
x6 <- count_na(df_y0)  # NA count among rows with Y == 0
df_nan <- data.frame(Variable, x1, x2, x3, x4, x5, x6)

但是,有没有更简洁的方法来做到这一点?由于各列的名称仍然是 x1、x2 等,我还想找到一种方法,在创建数据框的同时就改好这些列名(而不是事后再改一次,因为我的目的是让代码更简洁)。

3 个答案:

答案 0 :(得分:1)

下面是使用 gather 的一种方法:先把 'Y' 复制为新列 'new',以便在 summarise 中按 Y 的取值计算 'x3' 到 'x6'。

library(tidyverse)
# Keep a copy of Y (`y_flag`) so that Y itself can be reshaped and summarised
# like every other variable, then flag the missing cells once and aggregate.
df %>%
  mutate(y_flag = Y) %>%
  gather(key = "Variable", value = "cell", -y_flag) %>%
  mutate(is_missing = is.na(cell)) %>%
  group_by(Variable) %>%
  summarise(
    x1 = mean(is_missing),               # share of NA, all rows
    x2 = sum(is_missing),                # NA count, all rows
    x3 = mean(is_missing[y_flag == 1]),  # share of NA where Y == 1
    x4 = sum(is_missing[y_flag == 1]),   # NA count where Y == 1
    x5 = mean(is_missing[y_flag == 0]),  # share of NA where Y == 0
    x6 = sum(is_missing[y_flag == 0])    # NA count where Y == 0
  )
# A tibble: 5 x 7
#  Variable    x1    x2    x3    x4    x5    x6
#  <chr>    <dbl> <int> <dbl> <int> <dbl> <int>
#1 a          0.6     3 0.667     2   0.5     1
#2 b          0.6     3 0.667     2   0.5     1
#3 c          0       0 0         0   0       0
#4 d          0.4     2 0.333     1   0.5     1
#5 Y          0       0 0         0   0       0

或者使用 data.table 的 melt 和 dcast:

library(data.table)
# NOTE: setDT() converts `df` to a data.table by reference, and `:=` adds the
# helper column `new` (a copy of Y) to it in place.
# Long format: one row per (variable, observation); `value1` flags NA cells.
dM <- melt(setDT(df)[, new := Y], id.vars = "new")[, value1 := is.na(value)]

# Overall summary per variable. x1 is the share of NA and x2 the NA count,
# matching the x1/x2 definitions used in the question.
overall <- dM[, .(x1 = mean(value1), x2 = sum(value1)), by = variable]

# Share and count of NA per variable within each level of Y (wide format).
by_y <- dcast(
  dM,
  variable ~ new,
  value.var = "value1",
  fun.aggregate = c(mean, sum)
)

# Join the per-Y columns onto the overall summary.
overall[by_y, on = .(variable)]

数据

# Example data: four integer predictors with missing values and a binary
# outcome Y; row names are "0".."4" as in the table shown in the question.
df <- data.frame(
  a = c(NA, NA, 1L, NA, 6L),
  b = c(NA, 2L, 0L, NA, NA),
  c = c(8L, 5L, 7L, 7L, 2L),
  d = c(3L, 0L, NA, 1L, NA),
  Y = c(1L, 1L, 0L, 0L, 1L),
  row.names = c("0", "1", "2", "3", "4")
)

答案 1 :(得分:0)

使用 gather 和 summarise 可以实现您想要的结果。我没有完全重现您的输出格式,因为我觉得它有点难以阅读。

library(tidyverse)

## Rebuild the example data row by row, laid out like the table in the question
df <- tribble(
  ~a, ~b, ~c, ~d, ~Y,
  NA, NA,  8,  3,  1,
  NA,  2,  5,  0,  1,
   1,  0,  7, NA,  0,
  NA, NA,  7,  1,  0,
   6, NA,  2, NA,  1
)

## Missing-value count, row count and NA share per variable and level of Y
my_result <- df %>%
  gather(key = "column", value = "value", -Y) %>%
  group_by(column, Y) %>%
  summarise(
    total_na = sum(is.na(value)),
    total_obs = n(),
    # later summaries may refer to the ones defined just above
    percent_na = total_na / total_obs
  )

my_result

答案 2 :(得分:0)

我终于解决了这个问题(与我问题中的代码不同,这段代码不会创建太多变量)。尽管我认为应该还有更高效的方法,但我觉得这个方法已经相当有效:

def negative_converter(x):
    """Convert an accounting-style currency value to a float.

    Dollar signs and thousands separators are removed, surrounding whitespace
    is ignored, and a value containing a parenthesis (e.g. ``"($1,000)"``) is
    treated as negative.

    Args:
        x: The cell value; either a string such as ``"$1,000"`` or
            ``"(1000)"``, or a value that is already an int/float.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If the cleaned-up text is not a valid number.
    """
    # Values that are already numeric have no currency markup to strip.
    if isinstance(x, (int, float)):
        return float(x)
    text = x.replace('$', '').replace(',', '').strip()
    if '(' in text:
        # Accounting notation: parentheses mark a negative amount.
        text = '-' + text.strip('()').strip()
    return float(text)

# Load the workbook, running every cell of the profit column through
# negative_converter while it is being read, then show the result.
df = pd.read_excel(
    io='test.xlsx',
    converters={'Profits_In_Million': negative_converter},
)
print(df)
#      Profits_In_Million
#    0             1000.0
#    1            -1000.0