我一直在努力解决这个问题:给定两个向量,每个向量都可能包含重复的元素,如何测试其中一个是否完全包含在另一个中(重复次数也要计入)?`%in%` 不考虑重复次数。
我想不出一个不依赖 apply 家族函数的优雅解决方案。
# Needle vector: a single 1 followed by three 2s.
x <- c(1, rep(2, 3))
# Haystack: three 1s but only two 2s, then 3 through 6.
values <- c(rep(1, 3), rep(2, 2), 3, 4, 5, 6)
# Evaluates to TRUE even though x holds more 2s (three) than values does
# (two): %in% tests membership only and ignores how often a value repeats.
all(x %in% values)
# Multiset containment (apply-family version): TRUE when every distinct
# element of `x` occurs in `values` at least as many times as it occurs in `x`.
#
# values: vector to search in (left operand).
# x:      vector whose elements, repeats included, must all be present.
# Returns a single TRUE/FALSE; an empty `x` is contained in anything.
"%contains%" <- function(values, x) {
  # Check every distinct element of x rather than intersect(x, values):
  # intersecting discarded elements missing from `values`, so such inputs
  # were wrongly reported as contained.
  needed <- unique(x)
  all(vapply(
    needed,
    # %in% instead of == so that NA entries are counted instead of
    # turning the comparison into NA.
    function(item) sum(values %in% item) >= sum(x %in% item),
    logical(1)
  ))
}
# which yields the following:
> values %contains% x
[1] FALSE
> values <- c(values, 2)
> values %contains% x
[1] TRUE
基准更新
除了Marat提供的答案之外,我可能还找到了另一种解决方案
# Multiset containment by successive removal: walk through `x` and, for each
# element, delete one matching element from a working copy of `values`.
# The answer is FALSE as soon as an element of `x` has no match left.
#
# values: vector to search in (left operand).
# x:      vector whose elements, repeats included, must all be present.
# Returns a single TRUE/FALSE. No sentinel value is used, so negative
# numbers, NA and non-numeric vectors are handled as well.
"%contains%" <- function(values, x) {
  remaining <- values
  for (i in seq_along(x)) {
    # match() gives the position of the first equal element, or NA when
    # none is left (including when `remaining` is already empty).
    hit <- match(x[i], remaining)
    if (is.na(hit)) {
      return(FALSE)
    }
    remaining <- remaining[-hit]
  }
  TRUE
}
对目前为止的所有答案进行基准测试(包括 thelatemail 对 Marat 答案的修改版本,即 marat_tlm),分别使用较小和较大的 x:
library(microbenchmark)
# Fixed seeds keep both random samples reproducible between runs.
set.seed(31415)
values <- sample(0:100, size = 100000, replace = TRUE)
set.seed(11235)
x_lrg <- sample(0:100, size = 1000, replace = TRUE)
x_sml <- c(1, 2, 2, 2)

# Time each candidate implementation, first with the small and then with the
# large `x`; timings are reported in relative units.
lapply(list(x_sml, x_lrg), function(x) {
  microbenchmark(
    hoho_sapply(values, x),
    marat_table(values, x),
    marat_tlm(values, x),
    hoho_reduce(values, x),
    unit = "relative"
  )
})
# Small x
# [[1]]
# Unit: relative
# expr min lq mean median uq max neval
# hoho_sapply(values, x) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
# marat_table(values, x) 12.718392 10.966770 7.487895 9.260099 8.648351 1.819833 100
# marat_tlm(values, x) 1.354452 1.181094 1.026373 1.088879 1.266939 1.029560 100
# hoho_reduce(values, x) 2.951577 2.748087 2.069830 2.487790 2.216625 1.097648 100
#
# Large x
# [[2]]
# Unit: relative
# expr min lq mean median uq max neval
# hoho_sapply(values, x) 1.158303 1.172352 1.101410 1.177746 1.096661 0.6940260 100
# marat_table(values, x) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
# marat_tlm(values, x) 1.099669 1.059256 1.102543 1.071960 1.072881 0.9857229 100
# hoho_reduce(values, x) 85.666549 81.391495 69.089366 74.173366 66.943621 27.9766047 100
答案 0 :(得分:7)
尝试使用 `table`,例如:
# Multiset containment via frequency tables: TRUE when every element of `x`
# appears in `values` at least as often as it appears in `x`.
#
# values: vector to search in (left operand).
# x:      vector whose elements, repeats included, must all be present.
# Returns a single TRUE/FALSE; an empty `x` is contained in anything.
"%contain%" <- function(values, x) {
  # table() drops NA by default, which made NA entries of x invisible to the
  # check; useNA = "ifany" counts them like any other value.
  needed <- table(x, useNA = "ifany")
  available <- table(values, useNA = "ifany")
  # match() pairs an NA label with an NA label (indexing by name would not)
  # and yields NA for labels that `values` does not have at all.
  slot <- match(names(needed), names(available))
  supply <- as.vector(available)[slot]
  all(!is.na(supply) & supply >= as.vector(needed))
}
一些例子:
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2,2)
[1] FALSE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6, 2) %contain% c(1,2,2,2)
[1] TRUE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2)
[1] TRUE
> c(1, 1, 1, 2, 2, 3, 4, 5, 6) %contain% c(1,2,2,7)
[1] FALSE