我有一个数据框,如下所示
85 P 74 P 70 P 35 P 38 P 54
49 P 35 P 30 P 50 P 30 P 30
104 P 69 P 50 P 70 P 70 P 87
*44 P *35 P 8 F 10 F 9 F *37
*53 P 30 P *40 P 30 P *48 P *73
76 P 86 P 69 P 84 P 66 P 79
110 P 65 P 40 P 57 P 57 P 74
我需要用 R 删除所有包含以 * 开头的值的行。我尝试过 gsub 和 sqldf,但都没能解决这个问题。
答案 0 :(得分:2)
类似的东西:
# Drop every row in which at least one cell starts with a literal "*".
# The pattern anchors only on the leading star, so values such as "*0" or
# "*-5" are caught too (a character class like [a-zA-Z1-9] would miss them).
df2 <- df[!apply(df, 1, function(rg) any(grepl("^\\*", rg))), ]
应该有效
答案 1 :(得分:2)
可以用 lapply 来解决:
# Collect the row numbers that contain a cell starting with "*" in any column
# and keep the remaining rows. setdiff() is used instead of a negative index
# because dat[-integer(0), ] returns zero rows when nothing matches.
dat[setdiff(seq_len(nrow(dat)),
            unlist(lapply(dat, grep, pattern = "^\\*"))), ]
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
#1 85 P 74 P 70 P 35 P 38 P 54
#2 49 P 35 P 30 P 50 P 30 P 30
#3 104 P 69 P 50 P 70 P 70 P 87
#6 76 P 86 P 69 P 84 P 66 P 79
#7 110 P 65 P 40 P 57 P 57 P 74
替代:
# Row-wise any() over the per-column "starts with *" flags; rows with a hit
# are dropped. unname() keeps column names from being passed to mapply()/any()
# as argument names (a column called FUN or na.rm would otherwise break this).
dat[!do.call(mapply, c(any, unname(lapply(dat, grepl, pattern = "^\\*")))), ]
备选方案2:
# Count, per row, the cells that start with "*" and keep the rows with none.
# vapply() plus an explicit matrix() keeps the flags two-dimensional even for
# a one-row data frame, where sapply() would collapse them to a plain vector
# and rowSums() would fail.
dat[rowSums(matrix(vapply(dat, grepl, logical(nrow(dat)), pattern = "^\\*"),
                   nrow = nrow(dat), ncol = ncol(dat))) == 0, ]
答案 2 :(得分:2)
另一种选择是
# Flag every cell that begins with "*" in one pass over the character matrix,
# reshape the flags to the data frame's dimensions, and keep rows with no hit.
dat[rowSums(matrix(grepl("^\\*", as.matrix(dat)),
                   nrow = nrow(dat), ncol = ncol(dat))) == 0, ]
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
#1 85 P 74 P 70 P 35 P 38 P 54
#2 49 P 35 P 30 P 50 P 30 P 30
#3 104 P 69 P 50 P 70 P 70 P 87
#6 76 P 86 P 69 P 84 P 66 P 79
#7 110 P 65 P 40 P 57 P 57 P 74
# Build the benchmark input. The seeds are fixed so the data are reproducible.
set.seed(435)
# 5000 x 5000 data frame of integers drawn from 1..70 with replacement.
dat <- as.data.frame(matrix(sample(1:70, 5e3*5e3, replace=TRUE), ncol=5e3))
# Re-seed before choosing which cells receive a star.
set.seed(25)
# 50 row numbers and 50 column numbers, each sampled without replacement.
indx1 <- sample(1:nrow(dat), 50)
indx2 <- sample(1:ncol(dat), 50)
# Pair the i-th row number with the i-th column number and prefix the values
# at those 50 positions with "*".
dat[cbind(indx1, indx2)] <- paste0("*", dat[cbind(indx1, indx2)])
# Keep the rows of the global `dat` in which no column value starts with "*".
# Per column, grep(invert = TRUE) lists the rows that do not match; the
# intersection over all columns is the set of star-free rows. Inverting the
# match (instead of requiring "^[^*]") keeps rows with empty or NA cells.
f1 <- function() {
  star_free <- lapply(dat, grep, pattern = "^\\*", invert = TRUE)
  dat[Reduce(intersect, star_free), ]
}
# Keep the rows of the global `dat` that have no cell starting with "*".
# The starred row numbers are gathered over all columns and removed with
# setdiff(); a negative index is avoided because dat[-integer(0), ] would
# return zero rows when no cell is starred.
f2 <- function() {
  starred <- unlist(lapply(dat, grep, pattern = "^\\*"))
  dat[setdiff(seq_len(nrow(dat)), starred), ]
}
# Keep the rows of the global `dat` that have no cell starting with "*".
# mapply(any, ...) combines the per-column flags row by row. The flag list is
# unnamed first so that column names are not forwarded as argument names to
# mapply()/any() (a column called FUN or na.rm would otherwise break the call).
f3 <- function() {
  flags <- unname(lapply(dat, grepl, pattern = "^\\*"))
  dat[!do.call(mapply, c(any, flags)), ]
}
# Keep the rows of the global `dat` that have no cell starting with "*".
# The per-column flags are forced into an nrow x ncol matrix before rowSums();
# sapply() would collapse them to a plain vector for a one-row data frame and
# rowSums() would then fail.
f4 <- function() {
  flags <- vapply(dat, grepl, logical(nrow(dat)), pattern = "^\\*")
  flags <- matrix(flags, nrow = nrow(dat), ncol = ncol(dat))
  dat[rowSums(flags) == 0, ]
}
# Keep the rows of the global `dat` that have no cell starting with "*".
# All cells are tested in a single grepl() call on the character matrix; the
# flags are reshaped to the data frame's dimensions and summed per row.
f5 <- function() {
  cells <- as.matrix(dat)
  flags <- matrix(grepl("^\\*", cells), nrow = nrow(dat), ncol = ncol(dat))
  dat[rowSums(flags) == 0, ]
}
# Keep the rows of the global `dat` that have no cell starting with "*".
# Each row is scanned with apply(); the pattern anchors only on the leading
# star so that values like "*0..." are detected as well (the character class
# [a-zA-Z1-9] did not include 0).
f6 <- function() {
  row_has_star <- apply(dat, 1, function(rg) any(grepl("^\\*", rg)))
  dat[!row_has_star, ]
}
library(microbenchmark)
# Time the six variants, 20 evaluations each, reported in relative units.
microbenchmark(
  f1(), f2(), f3(), f4(), f5(), f6(),
  times = 20L,
  unit = "relative"
)
#Unit: relative
#expr min lq mean median uq max neval cld
#f1() 1.0000000 1.0000000 1.000000 1.000000 1.0000000 1.000000 20 a
#f2() 1.0027468 0.9161133 1.016727 1.114290 0.9195075 1.349399 20 a
#f3() 3.5439827 3.2813344 3.780002 4.030356 3.5895574 4.209253 20 b
#f4() 3.3107041 3.7476493 4.226460 3.981993 4.0828441 6.023643 20 b
#f5() 0.8852371 0.8952590 0.952933 1.075323 0.9116219 0.881139 20 a
#f6() 0.9693086 0.9810031 1.044375 1.086053 1.0062910 1.189163 20 a
答案 3 :(得分:2)
您可以使用 lapply 和 Reduce(以及另外三个函数:grep、intersect 和 [):
# For each column, take the rows whose value does NOT start with "*", then
# intersect across columns. invert = TRUE is used rather than the positive
# pattern "^[^*]" so that empty strings and NA cells do not cause a row to be
# dropped.
dat[Reduce(intersect, lapply(dat, grep, pattern = "^\\*", invert = TRUE)), ]
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
# 1 85 P 74 P 70 P 35 P 38 P 54
# 2 49 P 35 P 30 P 50 P 30 P 30
# 3 104 P 69 P 50 P 70 P 70 P 87
# 6 76 P 86 P 69 P 84 P 66 P 79
# 7 110 P 65 P 40 P 57 P 57 P 74
答案 4 :(得分:2)
用 paste 把每一行的各列拼接成一个字符串,用 grepl 标记出带星号的行,然后只保留未被标记的行:
# Paste each row into one string with a newline between cells, then drop the
# rows in which some cell begins with "*" (at the start of the string or right
# after a separator). Unlike a fixed search for "*" anywhere, a star in the
# middle of a value does not remove the row.
# NOTE(review): assumes no cell itself contains a newline.
DF[!grepl("(^|\n)\\*", do.call(paste, c(unname(as.list(DF)), sep = "\n"))), ]
结果如下:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
1 85 P 74 P 70 P 35 P 38 P 54
2 49 P 35 P 30 P 50 P 30 P 30
3 104 P 69 P 50 P 70 P 70 P 87
6 76 P 86 P 69 P 84 P 66 P 79
7 110 P 65 P 40 P 57 P 57 P 74