是否有现成的便捷函数,可以按给定的搜索模式在所有列中查找,从而过滤 data.table 中的行?
names(DT)
[1] "Name" "LongName" "SomeOtherCol" "NumericCol" "bar" "foo"
像这样的东西,适用于任意数量的列:
DT[Name %like% pattern | LongName %like% pattern | SomeOtherCol %like% pattern | bar %like% pattern | foo %like% pattern]
答案 0 :(得分:5)
一种方法是遍历各列,对每一列应用正则表达式,得到一个逻辑型的 data.table;然后用 rowSums 找出至少有一列匹配的行。
# Example table: three character columns to search through.
dt <- data.table(
  a = c("Aa1", "bb", "1c"),
  b = c("A1", "a1", "1C"),
  c = letters[1:3]
)
# "a1" is the pattern to search for
# Build one logical column per input column: TRUE where the cell matches.
ldt <- dt[, lapply(.SD, function(column) grepl("a1", column, perl = TRUE))]
# Keep the rows in which at least one column matched.
dt[rowSums(ldt) > 0]
# a b c
# 1: Aa1 A1 a
# 2: bb a1 b
答案 1 :(得分:2)
我不认为这是最好的方法,但它能达到目的:
> dt <- data.table(a=c("a1","bb","1c"),b=c("A1","BB","1C"))
> dt
a b
1: a1 A1
2: bb BB
3: 1c 1C
> combined <- apply(dt,1,function(r) paste(r,collapse="/%/"))
> combined
[1] "a1/%/A1" "bb/%/BB" "1c/%/1C"
> grepped <- grepl("[a-z][0-9]",apply(dt,1,function(r) paste(r,collapse="/")))
> grepped
[1] TRUE FALSE FALSE
> dt[grepped,]
a b
1: a1 A1
分隔符 "/%/" 必须是不会出现在搜索模式中的字符串,并且能可靠地分隔各列。
当然,这些步骤可以组合成一个表达式。
答案 2 :(得分:2)
解决方案3:
首先构建一个表达式(expression),把针对每一列的 grepl 条件用逻辑“或”连接起来;然后用 eval 一次性对整个表达式求值:
dt <- data.table(a=c("a1","bb","1c"),b=c("A1","BB","1C"))
# Filter a data.table down to the rows in which any column matches `pattern`.
#
# x:       a data.table; every column is searched.
# pattern: a single regular expression, matched case-insensitively
#          (ignore.case = TRUE, perl = FALSE).
#
# Returns the rows of `x` that have at least one matching column.
search.data.table <- function(x, pattern) {
  nms <- names(x)
  # A table without columns has nothing to search.
  if (length(nms) == 0L) {
    return(x)
  }
  # Build one grepl() call per column as a language object instead of pasting
  # source text: quotes or backslashes in `pattern` and non-syntactic column
  # names can then neither break nor inject code into the expression.
  per_column <- lapply(nms, function(nm) {
    bquote(grepl(.(pattern), .(as.name(nm)), ignore.case = TRUE, perl = FALSE))
  })
  # Chain the per-column tests into a single `a | b | ...` call.
  condition <- Reduce(function(lhs, rhs) call("|", lhs, rhs), per_column)
  # As in the text-based version, the call handed to eval() in `i` is resolved
  # against the columns of `x`.
  x[eval(condition)]
}
search.data.table(dt, "a1")
# a b
# 1: a1 A1
**基准测试**
# functions
# Filter the rows of `x` whose concatenated cell values match `pattern`.
#
# x:       a rectangular table that supports apply() and `[` row subsetting.
# pattern: a regular expression passed to grepl().
#
# Each row is collapsed into one string with "/" between the cells and the
# pattern is matched against that string, so a pattern containing "/" can
# match across a column boundary.
#
# NOTE(review): the original author reported that this function threw an error
# on the benchmark table; the cause is not visible here -- TODO confirm.
Raffael <- function(x, pattern) {
  # The "/%/"-separated `combined` vector was dropped: it cost a second full
  # pass over the table and its result was never read.
  row_text <- apply(x, 1, function(r) paste(r, collapse = "/"))
  grepped <- grepl(pattern, row_text)
  x[grepped, ]
}
# Return the rows of data.table `x` in which at least one column matches
# `pattern` (Perl-compatible regex, case-insensitive).
Arun <- function(x, pattern) {
  # One logical column per input column: TRUE where that cell matches.
  ldt <- x[, lapply(.SD, function(column) {
    grepl(pattern, column, perl = TRUE, ignore.case = TRUE)
  })]
  # A row qualifies as soon as any of its columns matched.
  x[rowSums(ldt) > 0]
}
# Return the rows of data.table `x` in which any column matches `pattern`.
#
# x:       a data.table; every column is searched.
# pattern: a single regular expression, matched case-insensitively
#          (ignore.case = TRUE, perl = FALSE).
DanielKrizian <- function(x, pattern) {
  nms <- names(x)
  # A table without columns has nothing to search.
  if (length(nms) == 0L) {
    return(x)
  }
  # Construct the grepl() calls as language objects rather than parsing pasted
  # text, so special characters in `pattern` (quotes, regex backslashes) and
  # unusual column names cannot corrupt the generated expression.
  tests <- lapply(nms, function(nm) {
    bquote(grepl(.(pattern), .(as.name(nm)), ignore.case = TRUE, perl = FALSE))
  })
  # Fold the per-column tests into one `a | b | ...` call.
  any_match <- Reduce(function(lhs, rhs) call("|", lhs, rhs), tests)
  # As in the text-based version, the call handed to eval() in `i` is resolved
  # against the columns of `x`.
  x[eval(any_match)]
}
# generate 1000 x 1000 benchmark data.table
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at first use.
library(data.table)
# NOTE(review): benchmark(..., replications = ) below matches the rbenchmark
# API; attach it explicitly so the script is self-contained -- confirm.
library(rbenchmark)
# Expression that yields one random 12-character alphanumeric string.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
expr <- quote(paste0(sample(c(LETTERS, tolower(LETTERS), 0:9), 12, replace = TRUE),
                     collapse = ""))
set.seed(1)
BIGISH <- data.table(matrix(replicate(1000 * 1000, eval(expr)), nrow = 1000))
object.size(BIGISH) # 68520912 bytes
# test
benchmark(
  DK <- DanielKrizian(BIGISH, "qx"),
  A <- Arun(BIGISH, "qx"),
  replications = 100)
**结果**
test replications elapsed relative user.self sys.self user.child sys.child
2 A <- Arun(BIGISH, "qx") 100 57.72 1.000 51.95 0.44 NA NA
1 DK <- DanielKrizian(BIGISH, "qx") 100 59.28 1.027 53.72 0.50 NA NA
identical(DK,A)
[1] TRUE