Question

R有一个很好用的函数match，可以在向量中查找值（也可以用%in%来测试值是否存在）。但是，如果我想在一个长向量中查找一个短向量，该怎么办？也就是说，测试给定向量是否（按顺序、连续地）包含在另一个向量中？又或者，如果我想判断给定向量是否为另一个向量的前缀/后缀，该怎么办？R中有这样的函数吗？

我想要的例子：

# Example inputs: x occurs inside y as a contiguous run, z does not.
x <- c(1, 3, 4)
y <- c(4, 1, 3, 4, 5)
z <- c(3, 1)

# Desired behaviour of the (hypothetical) helpers:
v_contains(x, y)  # TRUE: x is contained in y
v_contains(z, y)  # FALSE: the values of z are in y, but not in that order
v_match(x, y)     # 2: x appears in y starting at position 2

有没有类似的东西？您会如何高效地实现它？

Answer 1

最近的一篇帖子介绍了乔纳森·卡洛尔（Jonathan Carroll）的解决方案。我认为在R中不太可能有比它更快的解决方案。

# Find every position at which `needle` occurs as a contiguous run in
# `haystack`.
#
# Arguments:
#   needle   Vector to look for.
#   haystack Vector to search in.
#   nomatch  Value returned when `needle` does not occur in `haystack`
#            (or when `needle` is empty). Defaults to 0L.
#
# Returns an integer vector of 1-based start positions, or `nomatch` when
# there is no occurrence. Elements are compared with `==`; NA elements never
# match.
v_match <- function(needle, haystack, nomatch = 0L) {
  n_needle <- length(needle)
  if (n_needle == 0L) {
    return(nomatch)
  }

  # A match must leave room for the whole needle, so candidates past this
  # start position can be discarded up front. This also keeps every
  # `sieved + i` below inside the bounds of `haystack`.
  last_start <- length(haystack) - n_needle + 1L

  # Candidate starts: positions where the first needle element matches.
  sieved <- which(haystack == needle[1L])
  sieved <- sieved[sieved <= last_start]

  # Sieve the candidates one needle element at a time. seq_len() yields an
  # empty sequence for a length-1 needle (seq.int(1L, 0L) would count down),
  # and which() drops comparisons that evaluate to NA.
  for (i in seq_len(n_needle - 1L)) {
    sieved <- sieved[which(haystack[sieved + i] == needle[i + 1L])]
  }

  if (length(sieved) == 0L) nomatch else sieved
}

# Test whether `needle` occurs as a contiguous, ordered run inside `haystack`.
#
# Arguments:
#   needle   Vector to look for.
#   haystack Vector to search in.
#
# Returns TRUE when at least one occurrence exists, FALSE otherwise. An empty
# `needle` yields FALSE. Elements are compared with `==`; NA elements never
# match.
v_contains <- function(needle, haystack) {
  n_needle <- length(needle)
  if (n_needle == 0L) {
    return(FALSE)
  }

  # Only start positions that leave room for the whole needle are viable;
  # filtering them here keeps every `sieved + i` below within bounds, so a
  # candidate that would run off the end can no longer poison the result
  # with NA.
  last_start <- length(haystack) - n_needle + 1L
  sieved <- which(haystack == needle[1L])
  sieved <- sieved[sieved <= last_start]

  # seq_len() is empty for a length-1 needle (seq.int(1L, 0L) counts down).
  for (i in seq_len(n_needle - 1L)) {
    if (length(sieved) == 0L) {
      break
    }
    sieved <- sieved[which(haystack[sieved + i] == needle[i + 1L])]
  }

  length(sieved) > 0L
}

测试和基准测试

library(testthat)
x <- c(1, 3, 4)
y <- c(4, 1, 3, 4, 5)
z <- c(3, 1)

# x is a contiguous run inside y, starting at position 2.
expect_true(v_contains(x, y))
# The values of z are present in y, but never adjacent in that order.
expect_false(v_contains(z, y))
expect_equal(v_match(x, y), 2)

# Larger case: a shuffled 5:1e6 followed by a three-element tail. The values
# 1 and 3 occur only in the tail, so the needle can match only there.
x <- c(5, 1, 3)
yes <- c(sample(5:1e6), 5, 1, 3)
no <- c(sample(5:1e6), 4, 1, 3)
expect_true(v_contains(x, yes))
expect_false(v_contains(x, no))
expect_equal(v_match(x, yes), 1e6 - 3)


# Rolling-window containment check: slide a window of length(x) over y and
# report whether any window is identical() to x.
v_contains_roll <- function(x, y) {
  window_hits <- zoo::rollapply(y, width = length(x), FUN = identical, x)
  any(window_hits)
}
# String-based containment check: collapse both vectors into "_"-delimited
# strings and look for x's string inside y's string.
#
# Every element is wrapped in delimiters on both sides so that a match can
# only begin and end on element boundaries (otherwise c(1, 3) would be
# "found" in c(11, 3)). The pattern is matched literally via
# stringr::fixed(), because characters such as "." or "+" in formatted
# numbers are regex metacharacters.
#
# Arguments:
#   x  Vector to look for.
#   y  Vector to search in.
#
# Returns the logical result of stringr::str_detect().
v_contains_stri <- function(x, y) {
  haystack_str <- paste0("_", paste(y, collapse = "_"), "_")
  needle_str <- paste0("_", paste(x, collapse = "_"), "_")
  stringr::str_detect(haystack_str, stringr::fixed(needle_str))
}

# Compact numeric output without scientific notation for the timing table.
options(digits = 2, scipen = 99)
library(microbenchmark)
# Full collection with the "max used" statistics reset before timing.
gc(verbose = FALSE, reset = TRUE, full = TRUE)
#>           used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells  527502   28    1180915   63   527502   28
#> Vcells 3010073   23    8388608   64  3010073   23
# Time each implementation on a matching and a non-matching haystack,
# two evaluations each, run in blocks per expression.
microbenchmark(
  v_contains(x, yes),
  v_contains(x, no),
  v_contains_stri(x, yes),
  v_contains_stri(x, no),
  v_contains_roll(x, yes),
  v_contains_roll(x, no),
  times = 2L,
  control = list(order = "block")
)
#> Unit: milliseconds
#>                     expr    min     lq   mean median     uq    max neval
#>       v_contains(x, yes)    3.8    3.8    3.8    3.8    3.9    3.9     2
#>        v_contains(x, no)    3.7    3.7    3.7    3.7    3.8    3.8     2
#>  v_contains_stri(x, yes) 1658.4 1658.4 1676.7 1676.7 1695.0 1695.0     2
#>   v_contains_stri(x, no) 1632.3 1632.3 1770.0 1770.0 1907.8 1907.8     2
#>  v_contains_roll(x, yes) 5447.4 5447.4 5666.1 5666.1 5884.7 5884.7     2
#>   v_contains_roll(x, no) 5458.8 5458.8 5521.7 5521.7 5584.6 5584.6     2
#>  cld
#>  a  
#>  a  
#>   b 
#>   b 
#>    c
#>    c

由 reprex 包（v0.2.0）于 2018-08-18 创建。

Answer 2

x <- c(1, 3, 4)
y <- c(4, 1, 3, 4, 5)
z <- c(3, 1)

# Collapse each vector into a "_"-delimited string. Every element is wrapped
# in delimiters on both sides so matches can only start and end on element
# boundaries, and patterns are matched literally with stringr::fixed() so
# that "." or "+" in formatted numbers are not treated as regex syntax.
y_str <- paste0("_", paste(y, collapse = "_"), "_")
x_pat <- stringr::fixed(paste0("_", paste(x, collapse = "_"), "_"))
z_pat <- stringr::fixed(paste0("_", paste(z, collapse = "_"), "_"))

# 1. return TRUE x is contained in y

x_in_y <- stringr::str_detect(y_str, x_pat)
x_in_y

# 2. FALSE the values of z are in y, but not in the right order

z_in_y <- stringr::str_detect(y_str, z_pat)
z_in_y

# 3. returns 2 because x appears in y starting at position 2

# The match starts on the delimiter that precedes the first matched element,
# and element k is preceded by exactly k delimiters. Counting the delimiters
# up to and including the match start therefore gives the element index,
# whatever the printed width of the elements. (NA when there is no match.)
x_start <- stringr::str_locate(y_str, x_pat)[1]
x_pos <- stringr::str_count(
  stringr::str_sub(y_str, 1L, x_start),
  stringr::fixed("_")
)
x_pos

Answer 3

如果x和y与问题相同，那么这里有一些替代方法。

1）rollapply。此操作检查y中是否包含x。

library(zoo)

# Compare every length(x) window of y against x with identical().
any(rollapply(y, width = length(x), FUN = identical, x))
## [1] TRUE

2）embed。这种方法稍微复杂一点，但仍然只有一行，并且不依赖任何程序包。

# embed() lays out each window of y as a row in reversed order, hence the
# comparison against rev(x); after transposing, each column is one window.
any(
  apply(
    X = t(embed(y, dimension = length(x))) == rev(x),
    MARGIN = 2,
    FUN = all
  )
)
## [1] TRUE

2a）或以下版本：

# Row-wise variant: each row of embed() is one reversed window of y.
any(
  apply(
    X = embed(y, dimension = length(x)),
    MARGIN = 1,
    FUN = identical,
    rev(x)
  )
)
## [1] TRUE

3）字符串。将x和y都转换为字符串，然后使用grepl。问题下方的评论中已经给出了这种方法的代码链接。

4）Rcpp。如果速度很重要，那么我们可以用C++来实现。C++标准库已经内置了这一算法。将下面的代码保存为当前目录中的文件Search.cpp，然后在R中运行library(Rcpp); sourceCpp("Search.cpp")。之后，R代码Search(x, y)就会调用它。

#include <Rcpp.h>

using namespace Rcpp;
using namespace std;

// [[Rcpp::export]]
bool Search(NumericVector x, NumericVector y) {
  // std::search returns an iterator to the first occurrence of the range
  // [x.begin(), x.end()) inside y, or y.end() when there is none.
  NumericVector::iterator hit =
      std::search(y.begin(), y.end(), x.begin(), x.end());
  return hit != y.end();
}

在R中匹配“子向量”

3 个答案: