我有一个涉及循环问题的解决方案,并且有效,但我觉得我遗漏了一些涉及更有效实现的问题。问题:我有一个数字向量序列,并想要识别第一个向量的另一个向量中的起始位置。
它的工作原理如下:
# helper function for matchSequence
# wraps a vector by removing the first n elements and padding end with NAs
wrapVector <- function(x, n) {
stopifnot(n <= length(x))
if (n == length(x))
return(rep(NA, n))
else
return(c(x[(n+1):length(x)], rep(NA, n)))
}
wrapVector(LETTERS[1:5], 1)
## [1] "B" "C" "D" "E" NA
wrapVector(LETTERS[1:5], 2)
## [1] "C" "D" "E" NA NA
# returns the starting index positions of the sequence found in a vector
matchSequence <- function(seq, vec) {
matches <- seq[1] == vec
if (length(seq) == 1) return(which(matches))
for (i in 2:length(seq)) {
matches <- cbind(matches, seq[i] == wrapVector(vec, i - 1))
}
which(rowSums(matches) == i)
}
myVector <- c(3, NA, 1, 2, 4, 1, 1, 2)
matchSequence(1:2, myVector)
## [1] 3 7
matchSequence(c(4, 1, 1), myVector)
## [1] 5
matchSequence(1:3, myVector)
## integer(0)
有没有更好的方法来实施matchSequence()
?
加
“更好”这里可能意味着使用我没有想到的更优雅的方法,但更好的是,意味着更快。尝试比较解决方案:
set.seed(100)
myVector2 <- sample(c(NA, 1:4), size = 1000, replace = TRUE)
matchSequence(c(4, 1, 1), myVector2)
## [1] 12 48 91 120 252 491 499 590 697 771 865
microbenchmark::microbenchmark(matchSequence(c(4, 1, 1), myVector2))
## Unit: microseconds
## expr min lq mean median uq max naval
## matchSequence(c(4, 1, 1), myVector2) 154.346 160.7335 174.4533 166.2635 176.5845 300.453 100
答案 0 :(得分:9)
这是一个有点不同的想法:
f <- function(seq, vec) {
mm <- t(embed(vec, length(seq))) == rev(seq) ## relies on recycling of seq
which(apply(mm, 2, all))
}
myVector <- c(3, NA, 1, 2, 4, 1, 1, 2)
f(1:2, myVector)
# [1] 3 7
f(c(4,1,1), myVector)
# [1] 5
f(1:3, myVector)
# integer(0)
答案 1 :(得分:9)
一个递归的想法(2月5日编辑&#39; 16以模式中的NA
方式工作):
find_pat = function(pat, x)
{
ff = function(.pat, .x, acc = if(length(.pat)) seq_along(.x) else integer(0L)) {
if(!length(.pat)) return(acc)
if(is.na(.pat[[1L]]))
Recall(.pat[-1L], .x, acc[which(is.na(.x[acc]))] + 1L)
else
Recall(.pat[-1L], .x, acc[which(.pat[[1L]] == .x[acc])] + 1L)
}
return(ff(pat, x) - length(pat))
}
find_pat(1:2, myVector)
#[1] 3 7
find_pat(c(4, 1, 1), myVector)
#[1] 5
find_pat(1:3, myVector)
#integer(0)
find_pat(c(NA, 1), myVector)
#[1] 2
find_pat(c(3, NA), myVector)
#[1] 1
在基准测试中:
all.equal(matchSequence(s, my_vec2), find_pat(s, my_vec2))
#[1] TRUE
microbenchmark::microbenchmark(matchSequence(s, my_vec2),
flm(s, my_vec2),
find_pat(s, my_vec2),
unit = "relative")
#Unit: relative
# expr min lq median uq max neval
# matchSequence(s, my_vec2) 2.970888 3.096573 3.068802 3.023167 12.41387 100
# flm(s, my_vec2) 1.140777 1.173043 1.258394 1.280753 12.79848 100
# find_pat(s, my_vec2) 1.000000 1.000000 1.000000 1.000000 1.00000 100
使用更大的数据:
set.seed(911); VEC = sample(c(NA, 1:3), 1e6, TRUE); PAT = c(3, 2, 2, 1, 3, 2, 2, 1, 1, 3)
all.equal(matchSequence(PAT, VEC), find_pat(PAT, VEC))
#[1] TRUE
microbenchmark::microbenchmark(matchSequence(PAT, VEC),
flm(PAT, VEC),
find_pat(PAT, VEC),
unit = "relative", times = 20)
#Unit: relative
# expr min lq median uq max neval
# matchSequence(PAT, VEC) 23.106862 20.54601 19.831344 18.677528 12.563634 20
# flm(PAT, VEC) 2.810611 2.51955 2.963352 2.877195 1.728512 20
# find_pat(PAT, VEC) 1.000000 1.00000 1.000000 1.000000 1.000000 20
答案 2 :(得分:6)
另一个想法:
match_seq2 <- function(s,v){
n = length(s)
nc = length(v)-n+1
which(
n == rowsum(
as.integer(v[ rep(0:(n-1), nc) + rep(1:nc, each=n) ] == s),
rep(seq(nc),each=n)
)
)
}
我尝试了tapply
版本,但速度只有4倍。
第一个想法:
match_seq <- function(s, v) Filter(
function(i) all.equal( s, v[i + seq_along(s) - 1] ),
which( v == s[1] )
)
# examples:
my_vec <- c(3, NA, 1, 2, 4, 1, 1, 2)
match_seq(1:2, my_vec) # 3 7
match_seq(c(4,1,1), my_vec) # 5
match_seq(1:3, my_vec) # integer(0)
我使用all.equal
代替identical
,因为OP希望整数1:2
与数字c(1,2)
匹配。这种方法通过允许匹配my_vec
末尾之外的点(索引时为NA
)来引入另一个案例:
match_seq(c(1,2,NA), my_vec) # 7
OP的基准
# variant on Josh's, suggested by OP:
f2 <- function(seq, vec) {
mm <- t(embed(vec, length(seq))) == rev(seq) ## relies on recycling of seq
which(colSums(mm)==length(seq))
}
my_check <- function(values) {
all(sapply(values[-1], function(x) identical(values[[1]], x)))
}
set.seed(100)
my_vec2 <- sample(c(NA, 1:4), size = 1000, replace = TRUE)
s <- c(4,1,1)
microbenchmark(
op = matchSequence(s, my_vec2),
josh = f(s, my_vec2),
josh2 = f2(s, my_vec2),
frank = match_seq(s, my_vec2),
frank2 = match_seq2(s, my_vec2),
jlh = matchSequence2(s, my_vec2),
tlm = flm(s, my_vec2),
alexis = find_pat(s, my_vec2),
unit = "relative", check=my_check)
结果:
Unit: relative
expr min lq mean median uq max neval
op 3.693609 3.505168 3.222532 3.481452 3.433955 1.9204263 100
josh 15.670380 14.756374 12.617934 14.612219 14.575440 3.1076794 100
josh2 3.115586 2.937810 2.602087 2.903687 2.905654 1.1927951 100
frank 171.824973 157.711299 129.820601 158.304789 155.009037 15.8087792 100
frank2 9.352514 8.769373 7.364126 8.607341 8.415083 1.9386370 100
jlh 215.304342 197.643641 166.450118 196.657527 200.126846 44.1745551 100
tlm 1.277462 1.323832 1.125965 1.333331 1.379717 0.2375295 100
alexis 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
所以alexis_laz赢了!
(请随意更新此内容。请参阅alexis的答案以获取更多基准。)
答案 3 :(得分:6)
我认为另一种尝试再次更快。这要归功于它只检查向量中与搜索序列的开始相匹配的点的匹配。
flm <- function(sq, vec) {
hits <- which(sq[1]==vec)
out <- hits[
colSums(outer(0:(length(sq)-1), hits, function(x,y) vec[x+y]) == sq)==length(sq)
]
out[!is.na(out)]
}
基准测试结果:
#Unit: relative
# expr min lq mean median uq max neval
# josh2 2.469769 2.393794 2.181521 2.353438 2.345911 1.51641 100
# lm 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 100
答案 4 :(得分:4)
这是另一种方式:
myVector <- c(3, NA, 1, 2, 4, 1, 1, 2)
matchSequence <- function(seq,vec) {
n.vec <- length(vec)
n.seq <- length(seq)
which(sapply(1:(n.vec-n.seq+1),function(i)all(head(vec[i:n.vec],n.seq)==seq)))
}
matchSequence(1:2,myVector)
# [1] 3 7
matchSequence(c(4,1,1),myVector)
# [1] 5
matchSequence(1:3,myVector)
# integer(0)