按行计算给定列中的NA

时间:2018-06-04 12:16:10

标签: r dplyr

我想按行统计选定列中NA的个数,并将结果保存在新列中。我希望使用dplyr中的mutate()函数来实现这一点。

它应该如何运作:

# Intended behaviour, spelled out as a row-by-row loop: for every row i, count
# the NAs in columns 1-2, 3-4 and 5-6 and store each count in that row of the
# matching SUM column.
for (i in seq_len(nrow(test))) {
  # Assign into element i; assigning to the whole column would overwrite every
  # row with the count of the current row.
  test$SUM[i] <- sum(is.na(test[i, 1:2]))
  test$SUM2[i] <- sum(is.na(test[i, 3:4]))
  test$SUM3[i] <- sum(is.na(test[i, 5:6]))
}

使用的数据:

# Example data: six numeric columns (BIEZ_01 ... BIEZ_06) with six rows each
# and a few missing values scattered across them.
test <- data.frame(
  BIEZ_01 = c(59000, 61462, NA, 33000, 30840, 36612),
  BIEZ_02 = c(5060, 55401, 33000, 33000, 30840, 28884),
  BIEZ_03 = c(NA, 60783, 20000, 20000, NA, 19248),
  BIEZ_04 = c(22100, 59885, 15000, 15000, 20840, 10000),
  BIEZ_05 = c(NA, 59209, 15000, 15000, 20840, NA),
  BIEZ_06 = c(4400, 6109, NA, 500, 10840, 10000)
)

3 个答案:

答案 0 :(得分:1)

以下是使用 apply 函数的解决方案:

# For every row of `test`, count the NAs inside each of the three column pairs
# (1-2, 3-4, 5-6). apply() returns one column per row of `test`, so the result
# is transposed before it is bound to the original data.
NA_counts <- apply(test, 1, function(row_values) {
  missing <- is.na(row_values)
  c(
    SUM1 = sum(missing[1:2]),
    SUM2 = sum(missing[3:4]),
    SUM3 = sum(missing[5:6])
  )
})
cbind(test, t(NA_counts))

答案 1 :(得分:1)

# Walk over the columns two at a time (1-2, 3-4, ...) and add one column per
# pair holding the per-row NA count. seq() is evaluated once, before the loop
# starts, so the SUM columns appended along the way are not visited.
for (i in seq(1, ncol(test), 2)) {
  # paste0() rather than paste(): paste() would insert a space and create the
  # non-syntactic names "SUM 1", "SUM 2", "SUM 3".
  test[[paste0("SUM", (i + 1) / 2)]] <- rowSums(is.na(test[c(i, i + 1)]))
}
#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 SUM1 SUM2 SUM3
# 1   59000    5060      NA   22100      NA    4400    0    1    1
# 2   61462   55401   60783   59885   59209    6109    0    0    0
# 3      NA   33000   20000   15000   15000      NA    1    0    1
# 4   33000   33000   20000   15000   15000     500    0    0    0
# 5   30840   30840      NA   20840   20840   10840    0    1    0
# 6   36612   28884   19248   10000      NA   10000    0    0    1

这有点“整洁”:

library(tidyverse)

# Split the column positions into consecutive pairs (1-2, 3-4, ...), add a
# SUM<k> column with the per-row NA count to each pair, then bind the pieces
# back together side by side.
split(seq_len(ncol(test)), (seq_len(ncol(test)) - 1) %/% 2 + 1) %>%
  imap(function(cols, grp) {
    pair <- test[cols]
    # Plain column assignment creates SUM<k> directly, instead of asking
    # mutate_at() to modify a column that does not exist yet.
    pair[[paste0("SUM", grp)]] <- rowSums(is.na(pair))
    pair
  }) %>%
  bind_cols()

#   BIEZ_01 BIEZ_02 SUM1 BIEZ_03 BIEZ_04 SUM2 BIEZ_05 BIEZ_06 SUM3
# 1   59000    5060    0      NA   22100    1      NA    4400    1
# 2   61462   55401    0   60783   59885    0   59209    6109    0
# 3      NA   33000    1   20000   15000    0   15000      NA    1
# 4   33000   33000    0   20000   15000    0   15000     500    0
# 5   30840   30840    0      NA   20840    1   20840   10840    0
# 6   36612   28884    0   19248   10000    0      NA   10000    1

这将是非常整洁的版本:

# Long-format approach: reshape to one row per (row, column) cell, count the
# NAs per original row and per pair of columns, then reshape back to wide and
# attach the counts to the original data.
test %>%
  # remember which original row every cell came from
  rowid_to_column("rowid") %>%
  # stack every column except the first one (rowid); the following steps refer
  # to the stacked columns as `key` and `value`
  gather(,,-1) %>%
  # number the keys and halve (rounding up) so that two consecutive keys share
  # one SUM group
  # NOTE(review): presumably the indices follow sorted key order, which matches
  # the column order of this data - confirm for other column names
  mutate(SUM = ceiling(group_indices(.,key)/2)) %>%
  group_by(rowid,SUM) %>%
  # number of NA cells per original row and column pair
  summarize(sum_val = sum(is.na(value))) %>%
  ungroup %>%
  # one column per SUM group; the printed result below names them SUM1..SUM3
  spread(SUM,sum_val,sep="") %>%
  # drop the first column (rowid at this point)
  select(-1) %>%
  # append the counts to the original columns
  bind_cols(test,.)

#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 SUM1 SUM2 SUM3
# 1   59000    5060      NA   22100      NA    4400    0    1    1
# 2   61462   55401   60783   59885   59209    6109    0    0    0
# 3      NA   33000   20000   15000   15000      NA    1    0    1
# 4   33000   33000   20000   15000   15000     500    0    0    0
# 5   30840   30840      NA   20840   20840   10840    0    1    0
# 6   36612   28884   19248   10000      NA   10000    0    0    1

我还尝试过用nest把列每两个分为一组,并对嵌套结果使用map_dfc来添加新列,但由于nest的.key参数采用非标准求值,我在尝试使用reduce时卡住了……如果能做到,代码会更短、更易读。

答案 2 :(得分:1)

另一个选择

# Per-row NA counts for each consecutive pair of columns: a matrix with one row
# per row of `test` and one column per pair.
NA.counts <- vapply(
  split(seq_len(ncol(test)), ceiling(seq_len(ncol(test)) / 2)),
  # test[x] (not test[, x]) stays a data frame even when a group holds a single
  # column, so rowSums() also works for an odd number of columns.
  function(x) rowSums(is.na(test[x])),
  numeric(nrow(test))
)

如果您想使用tidyverse添加这些列,可以这样做:

library(tidyverse)
# Append the per-pair NA counts to `test`; cbind() names the new columns
# NA.counts.1, NA.counts.2, ... after the list elements returned by map().
test %>%
  cbind(NA.counts = map(
    seq_len(ncol(test)) %>% split(ceiling(. / 2)),
    # test[.] (not test[, .]) stays a data frame even when a group holds a
    # single column, so rowSums() also works for an odd number of columns.
    ~ rowSums(is.na(test[.]))
  ))


#   BIEZ_01 BIEZ_02 BIEZ_03 BIEZ_04 BIEZ_05 BIEZ_06 NA.counts.1 NA.counts.2 NA.counts.3
# 1   59000    5060      NA   22100      NA    4400           0           1           1
# 2   61462   55401   60783   59885   59209    6109           0           0           0
# 3      NA   33000   20000   15000   15000      NA           1           0           1
# 4   33000   33000   20000   15000   15000     500           0           0           0
# 5   30840   30840      NA   20840   20840   10840           0           1           0
# 6   36612   28884   19248   10000      NA   10000           0           0           1

正如@Moody_Mudskipper指出的那样,如果你想直接修改数据框,cbind是不必要的。您可以使用以下代码添加列:

# Add the counts to `test` in place, one SUM<k> column per consecutive pair of
# columns. ceiling() keeps the number of names equal to the number of groups
# produced by split() when the column count is odd.
test[paste0("SUM", seq_len(ceiling(ncol(test) / 2)))] <- map(
  seq_len(ncol(test)) %>% split(ceiling(. / 2)),
  ~ rowSums(is.na(test[.]))
)