我有以下data.frame
:
dage ded dht dwt marital inc smoke time number
31 5 65 110 1 1 0 0 0
38 5 70 148 1 4 0 0 0
32 1 99 999 1 2 1 1 1
28 4 99 999 1 98 3 4 2
35 4 99 999 1 7 0 0 0
33 4 98 998 1 99 0 0 0
我想删除任何具有数字99
或999
(或两者都有)的行。
data.frame结构:
df <- structure(list(dage = c(31L, 38L, 32L, 28L, 35L, 33L), ded = c(5L,
5L, 1L, 4L, 4L, 4L), dht = c(65L, 70L, 99L, 99L, 99L, 98L), dwt = c(110L,
148L, 999L, 999L, 999L, 998L), marital = c(1L, 1L, 1L, 1L, 1L,
1L), inc = c(1L, 4L, 2L, 98L, 7L, 99L), smoke = c(0L, 0L, 1L,
3L, 0L, 0L), time = c(0L, 0L, 1L, 4L, 0L, 0L), number = c(0L,
0L, 1L, 2L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")
答案 0 :(得分:1)
您可以先用99
替换999
和NA
。
dat[dat == 99 | dat == 999] <- NA
然后使用na.omit
或complete.cases
。
na.omit(dat)
# dage ded dht dwt marital inc smoke time number
# 1 31 5 65 110 1 1 0 0 0
# 2 38 5 70 148 1 4 0 0 0
dat[complete.cases(dat), ]
# dage ded dht dwt marital inc smoke time number
# 1 31 5 65 110 1 1 0 0 0
# 2 38 5 70 148 1 4 0 0 0
数据
dat <- read.table(text = "dage ded dht dwt marital inc smoke time number
31 5 65 110 1 1 0 0 0
38 5 70 148 1 4 0 0 0
32 1 99 999 1 2 1 1 1
28 4 99 999 1 98 3 4 2
35 4 99 999 1 7 0 0 0
33 4 98 998 1 99 0 0 0",
header = TRUE)
答案 1 :(得分:0)
如果您的数据框称为df1
:
require(dplyr)
filter_all(df1, all_vars(.!=99 & .!=999))
结果:
dage ded dht dwt marital inc smoke time number
1 31 5 65 110 1 1 0 0 0
2 38 5 70 148 1 4 0 0 0
答案 2 :(得分:0)
这是使用QString const&
和any()
的解决方案,不需要任何补充软件包:
apply()
收益:
#fake data
d <- data.frame(a = c(1,2,3,4,99), b = c(99, 1,2,999,4))
#subset rows that don't contain a 99 or 999
d[!apply(d, 1, function(x) any(x %in% c(99,999))),]
答案 3 :(得分:0)
使用rowSums
df[rowSums(df[,c('dht','dwt')]==99|df[,c('dht','dwt')]==999)==0,]
ded dht dwt
1 5 65 110
2 5 70 148
6 4 98 998
答案 4 :(得分:0)
按照原始问题所示创建data.frame:
df <- structure(list(dage = c(31L, 38L, 32L, 28L, 35L, 33L), ded = c(5L,
5L, 1L, 4L, 4L, 4L), dht = c(65L, 70L, 99L, 99L, 99L, 98L), dwt = c(110L,
148L, 999L, 999L, 999L, 998L), marital = c(1L, 1L, 1L, 1L, 1L,
1L), inc = c(1L, 4L, 2L, 98L, 7L, 99L), smoke = c(0L, 0L, 1L,
3L, 0L, 0L), time = c(0L, 0L, 1L, 4L, 0L, 0L), number = c(0L,
0L, 1L, 2L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")
data.table
解决方案:
library(data.table)
dt <- as.data.table(df)
dt[rowSums(df == 99)==0 & rowSums(df == 999)==0]
base
R解决方案:
df[!apply(df, 1, function(x) any(x %in% c(99,999))),]
dplyr
解决方案:
require(dplyr)
filter_all(df, all_vars(.!=99 & .!=999))
基准:
microbenchmark::microbenchmark(dt = dt[rowSums(df == 99)==0 & rowSums(df == 999)==0],
base = df[!apply(df, 1, function(x) any(x %in% c(99,999))),],
dplyr = filter_all(df, all_vars(.!=99 & .!=999)), times = 10000)
# Unit: microseconds
#expr min lq mean median uq max neval
#dt 588.000 645.801 701.4309 675.6005 723.2515 5203.801 10000
#base 264.601 296.901 324.2588 314.4005 335.7020 3435.600 10000
#dplyr 3671.400 3854.301 4036.3976 3915.3010 3983.0010 139226.802 10000