Question

我有以下data.frame：

dage ded dht dwt marital inc smoke time number
31   5  65 110       1   1     0    0      0
38   5  70 148       1   4     0    0      0
32   1  99 999       1   2     1    1      1
28   4  99 999       1  98     3    4      2
35   4  99 999       1   7     0    0      0
33   4  98 998       1  99     0    0      0

我想删除任何具有数字99或999（或两者都有）的行。

data.frame结构：

# Example data from the question; every column is an integer vector.
df <- data.frame(
  dage    = c(31L, 38L, 32L, 28L, 35L, 33L),
  ded     = c(5L, 5L, 1L, 4L, 4L, 4L),
  dht     = c(65L, 70L, 99L, 99L, 99L, 98L),
  dwt     = c(110L, 148L, 999L, 999L, 999L, 998L),
  marital = c(1L, 1L, 1L, 1L, 1L, 1L),
  inc     = c(1L, 4L, 2L, 98L, 7L, 99L),
  smoke   = c(0L, 0L, 1L, 3L, 0L, 0L),
  time    = c(0L, 0L, 1L, 4L, 0L, 0L),
  number  = c(0L, 0L, 1L, 2L, 0L, 0L)
)

Answer 1

您可以先将99和999替换为NA。

# Mark every cell equal to 99 or 999 as missing.
is.na(dat) <- dat == 99 | dat == 999

然后使用na.omit或complete.cases。

# na.omit() returns only the rows that contain no NA.
na.omit(dat)
#   dage ded dht dwt marital inc smoke time number
# 1   31   5  65 110       1   1     0    0      0
# 2   38   5  70 148       1   4     0    0      0

# Same rows via complete.cases(), which flags the rows without any NA;
# the logical vector is used as the row index.
dat[complete.cases(dat), ]
#   dage ded dht dwt marital inc smoke time number
# 1   31   5  65 110       1   1     0    0      0
# 2   38   5  70 148       1   4     0    0      0

数据

# Parse the question's table from an inline string; its first line is the header.
dat <- read.table(
  header = TRUE,
  text = "dage ded dht dwt marital inc smoke time number
31   5  65 110       1   1     0    0      0
38   5  70 148       1   4     0    0      0
32   1  99 999       1   2     1    1      1
28   4  99 999       1  98     3    4      2
35   4  99 999       1   7     0    0      0
33   4  98 998       1  99     0    0      0"
)

Answer 2

如果您的数据框称为df1：

# Load dplyr with library() so a missing package fails right here;
# require() would only return FALSE and let the next call fail obscurely.
library(dplyr)
# Keep a row only if every column is free of 99 and 999. if_all() replaces
# the superseded filter_all()/all_vars(); %in% never yields NA, so rows that
# hold NA are kept instead of being silently dropped.
filter(df1, if_all(everything(), ~ !.x %in% c(99, 999)))

结果：

  dage ded dht dwt marital inc smoke time number
1   31   5  65 110       1   1     0    0      0
2   38   5  70 148       1   4     0    0      0

Answer 3

这是使用apply()和any()的解决方案，不需要任何额外的软件包：

apply()

示例如下：

# Example data: 99 and 999 are the values to screen out.
d <- data.frame(
  a = c(1, 2, 3, 4, 99),
  b = c(99, 1, 2, 999, 4)
)
# Walk the rows with apply(); drop each row in which any() value is 99 or 999.
d[!apply(d, 1, function(row_values) any(row_values %in% c(99, 999))), ]

Answer 4

使用rowSums

# Count the cells equal to 99 or 999 in each row across ALL columns and keep
# the rows with a count of zero. Checking only dht/dwt would let row 6
# (inc = 99) through. na.rm = TRUE stops an NA cell from turning the row
# index into NA.
df[rowSums(df == 99 | df == 999, na.rm = TRUE) == 0, ]
#   dage ded dht dwt marital inc smoke time number
# 1   31   5  65 110       1   1     0    0      0
# 2   38   5  70 148       1   4     0    0      0

Answer 5

按照原始问题所示创建data.frame：

# Same six-row integer data set as in the question.
df <- data.frame(
  dage    = c(31L, 38L, 32L, 28L, 35L, 33L),
  ded     = c(5L, 5L, 1L, 4L, 4L, 4L),
  dht     = c(65L, 70L, 99L, 99L, 99L, 98L),
  dwt     = c(110L, 148L, 999L, 999L, 999L, 998L),
  marital = c(1L, 1L, 1L, 1L, 1L, 1L),
  inc     = c(1L, 4L, 2L, 98L, 7L, 99L),
  smoke   = c(0L, 0L, 1L, 3L, 0L, 0L),
  time    = c(0L, 0L, 1L, 4L, 0L, 0L),
  number  = c(0L, 0L, 1L, 2L, 0L, 0L)
)

data.table解决方案：

library(data.table)
# Convert the data.frame to a data.table.
dt <- as.data.table(df)
# Keep the rows in which neither 99 nor 999 occurs. The two row counts are
# computed from the original df (dt was built from it, so the rows line up)
# and the resulting logical vector is used as dt's row index.
dt[rowSums(df == 99)==0 & rowSums(df == 999)==0]

base R解决方案：

 # For each row (MARGIN = 1) test whether any value is 99 or 999, then keep
 # only the rows where that test is FALSE.
 df[!apply(df, 1, function(x) any(x %in% c(99,999))),]

dplyr解决方案：

# Load dplyr with library() so a missing package fails right here;
# require() would only return FALSE and let the next call fail obscurely.
library(dplyr)
# Keep the rows in which every column differs from both 99 and 999.
# The call is left in this exact form because it is the expression timed
# in the benchmark.
filter_all(df, all_vars(.!=99 & .!=999))

基准：

# Time the three approaches on the six-row example data, 10000 runs each.
# The dt, base and dplyr entries are the same expressions shown in the
# solutions above.
microbenchmark::microbenchmark(dt = dt[rowSums(df == 99)==0 & rowSums(df == 999)==0], 
base = df[!apply(df, 1, function(x) any(x %in% c(99,999))),], 
dplyr = filter_all(df, all_vars(.!=99 & .!=999)), times = 10000)
# Unit: microseconds
  #expr      min       lq      mean    median        uq        max neval
  #dt    588.000  645.801  701.4309  675.6005  723.2515   5203.801 10000
  #base  264.601  296.901  324.2588  314.4005  335.7020   3435.600 10000
  #dplyr 3671.400 3854.301 4036.3976 3915.3010 3983.0010 139226.802 10000

从数据库中的多个变量中删除数据框中的行

5 个答案: