df示例:
a b c d Y
0 NA NA 8 3 1
1 NA 2 5 0 1
2 1 0 7 NA 0
3 NA NA 7 1 0
4 6 NA 2 NA 1
我正在尝试创建一个汇总各变量缺失值情况的数据框(其中 Y 是二元变量):
# Build a per-column summary of missing values in `df`: overall, and split by
# the binary outcome column `Y`.

# Share (between 0 and 1) of NA entries in every column of `data`.
na_share <- function(data) {
  vapply(data, function(col) mean(is.na(col)), numeric(1))
}

# Number of NA entries in every column of `data`.
na_count <- function(data) {
  vapply(data, function(col) sum(is.na(col)), integer(1))
}

# `%in%` never returns NA, so rows whose Y is itself missing are dropped.
# With `df[df$Y == 1, ]` such rows would come back as all-NA rows and be
# counted as missing values in every column.
df_y1 <- df[df$Y %in% 1, , drop = FALSE]
df_y0 <- df[df$Y %in% 0, , drop = FALSE]

Variable <- colnames(df)
x1 <- na_share(df)     # NA share over all rows
x2 <- na_count(df)     # NA count over all rows
x3 <- na_share(df_y1)  # NA share among rows with Y = 1
x4 <- na_count(df_y1)  # NA count among rows with Y = 1
x5 <- na_share(df_y0)  # NA share among rows with Y = 0
x6 <- na_count(df_y0)  # NA count among rows with Y = 0
df_nan <- data.frame(Variable, x1, x2, x3, x4, x5, x6)
但是,有没有更简洁的方法来做到这一点?由于各列的名称仍然是 x1、x2 等,我想找到一种在创建数据框的同时就更改这些名称的方法(而不是事后再改,因为目的是让代码更简洁)。
答案 0 :(得分:1)
下面是一种先将数据重塑为长格式(例如使用 gather)的方法:我们把 'Y' 复制为一个新列,以便用 summarise 计算 'x3' 到 'x6' 的输出。
library(tidyverse)
# Per-variable NA summary: overall share/count (x1, x2), and the same split
# by Y = 1 (x3, x4) and Y = 0 (x5, x6).
df %>%
  # Keep a copy of Y as `new` so that Y itself can be summarised as a variable.
  mutate(new = Y) %>%
  # Turn every cell into a missingness flag before reshaping: all reshaped
  # columns are then logical, so columns of different types can be stacked.
  mutate(across(-new, is.na)) %>%
  pivot_longer(-new, names_to = "Variable", values_to = "is_missing") %>%
  group_by(Variable) %>%
  summarise(
    x1 = mean(is_missing),
    x2 = sum(is_missing),
    # `%in%` never yields NA, so rows with a missing Y are left out of the
    # per-group figures instead of being counted as missing values.
    x3 = mean(is_missing[new %in% 1]),
    x4 = sum(is_missing[new %in% 1]),
    x5 = mean(is_missing[new %in% 0]),
    x6 = sum(is_missing[new %in% 0])
  )
# A tibble: 5 x 7
# Variable x1 x2 x3 x4 x5 x6
# <chr> <dbl> <int> <dbl> <int> <dbl> <int>
#1 a 0.6 3 0.667 2 0.5 1
#2 b 0.6 3 0.667 2 0.5 1
#3 c 0 0 0 0 0 0
#4 d 0.4 2 0.333 1 0.5 1
#5 Y 0 0 0 0 0 0
或者使用 data.table 的 melt 和 dcast:
library(data.table)
# Long format: one row per (original row, variable). `new` carries the row's Y
# value and `value1` flags whether the cell is missing.
# NOTE: setDT() converts `df` to a data.table by reference and `:=` adds the
# `new` column to it, so `df` itself is modified by this line.
dM <- melt(setDT(df)[, new := Y], id.vars = "new")[, value1 := is.na(value)]

# x1 is the NA share and x2 the NA count per variable, matching the meaning of
# x1/x2 in the question.
overall <- dM[, .(x1 = mean(value1), x2 = sum(value1)), by = variable]

# One mean column and one sum column of the NA flag for each level of `new`.
by_y <- dcast(dM, variable ~ new, value.var = "value1",
  fun.aggregate = list(mean, sum))

# Join the overall figures onto the per-Y figures by variable.
overall[by_y, on = .(variable)]
# Example data: four integer measurement columns containing NAs and a binary
# outcome Y; row names run from "0" to "4".
df <- data.frame(
  a = c(NA, NA, 1L, NA, 6L),
  b = c(NA, 2L, 0L, NA, NA),
  c = c(8L, 5L, 7L, 7L, 2L),
  d = c(3L, 0L, NA, 1L, NA),
  Y = c(1L, 1L, 0L, 0L, 1L),
  row.names = c("0", "1", "2", "3", "4")
)
答案 1 :(得分:0)
先将数据重塑为长格式(gather),再用 summarise 汇总,就可以实现您想要的结果。我没有完全重现您的输出,因为我觉得它有点难以理解。
library(tidyverse)
## Reproducing the data
# Same values as the example table in the question, stored as doubles.
df <- tibble(
  a = c(NA_real_, NA_real_, 1, NA_real_, 6),
  b = c(NA_real_, 2, 0, NA_real_, NA_real_),
  c = c(8, 5, 7, 7, 2),
  d = c(3, 0, NA_real_, 1, NA_real_),
  Y = c(1, 1, 0, 0, 1)
)
## Creating table
# For every (column, Y) pair: the number of NA values, the number of
# observations, and the resulting share of NAs.
my_result <- df %>%
  pivot_longer(-Y, names_to = "column", values_to = "value") %>%
  group_by(column, Y) %>%
  summarise(
    total_na = sum(is.na(value)),
    total_obs = n(),
    # Return an ungrouped tibble so later steps are not silently run per group.
    .groups = "drop"
  ) %>%
  mutate(percent_na = total_na / total_obs)
my_result
答案 2 :(得分:0)
我终于解决了这个问题(这段代码不像我问题中的代码那样创建那么多变量)。虽然我认为应该还有更高效的方法,但我觉得这已经相当有效了:
def negative_converter(x):
    """Convert an accounting-style amount to a float.

    A value wrapped in parentheses is treated as negative, e.g. '($1000)'
    becomes -1000.0. Dollar signs, thousands separators and surrounding
    whitespace are ignored.

    Args:
        x: The raw cell value. Usually a string; values that are not strings
            (e.g. cells already read as numbers) are passed straight to
            float().

    Returns:
        The parsed amount as a float.

    Raises:
        ValueError: If the cleaned text is not a valid number.
        TypeError: If a non-string value cannot be converted by float().
    """
    # Non-string values have no .replace(); convert them directly.
    if not isinstance(x, str):
        return float(x)
    text = x.replace('$', '').replace(',', '').strip()
    if '(' in text:
        # Parentheses mark a negative amount.
        text = '-' + text.strip('()')
    return float(text)
# Load the workbook, running every 'Profits_In_Million' cell through
# negative_converter, then show the resulting frame.
column_converters = {'Profits_In_Million': negative_converter}
df = pd.read_excel('test.xlsx', converters=column_converters)
print(df)
# Profits_In_Million
# 0 1000.0
# 1 -1000.0