Question

我希望按行找到两个模式的第一个实例。具体来说，我希望在c(1,0)的每一行中找到c(1,1)的第一次出现以及data.frame的第一次出现。下面的代码使用嵌套的for-loops执行此操作，但对于大型数据集来说速度非常慢。

有没有办法在R基地有效地做到这一点？这个问题类似：

Finding pattern in a matrix in R

这是我的代码，如果模式在第1列开始并在第2列结束，则返回2，如果模式没有连续出现，则返回0。

n <- 5

my.data <- expand.grid(rep(list(1:0), n))
my.data <- my.data[do.call(order, as.list(my.data)),]
my.data <- my.data[order(nrow(my.data):1),]

first.11 <- rep(0, nrow(my.data))
first.10 <- rep(0, nrow(my.data))

for(i in 1:nrow(my.data)) {
     for(j in 1:(ncol(my.data)-1)) {

     if(first.11[i] == 0 & my.data[i,j] == 1 & my.data[i,(j+1)] == 1) first.11[i] = j+1
     if(first.10[i] == 0 & my.data[i,j] == 1 & my.data[i,(j+1)] == 0) first.10[i] = j+1

     }
}

my.data2 <- data.frame(my.data, first.11, first.10)
my.data2

#   Var1 Var2 Var3 Var4 Var5 first.11 first.10
#1     1    1    1    1    1        2        0
#17    1    1    1    1    0        2        5
#9     1    1    1    0    1        2        4
#25    1    1    1    0    0        2        4
#5     1    1    0    1    1        2        3
#21    1    1    0    1    0        2        3
#13    1    1    0    0    1        2        3
#29    1    1    0    0    0        2        3
#3     1    0    1    1    1        4        2
#19    1    0    1    1    0        4        2
#11    1    0    1    0    1        0        2
#27    1    0    1    0    0        0        2
#7     1    0    0    1    1        5        2
#23    1    0    0    1    0        0        2
#15    1    0    0    0    1        0        2
#31    1    0    0    0    0        0        2
#2     0    1    1    1    1        3        0
#18    0    1    1    1    0        3        5
#10    0    1    1    0    1        3        4
#26    0    1    1    0    0        3        4
#6     0    1    0    1    1        5        3
#22    0    1    0    1    0        0        3
#14    0    1    0    0    1        0        3
#30    0    1    0    0    0        0        3
#4     0    0    1    1    1        4        0
#20    0    0    1    1    0        4        5
#12    0    0    1    0    1        0        4
#28    0    0    1    0    0        0        4
#8     0    0    0    1    1        5        0
#24    0    0    0    1    0        0        5
#16    0    0    0    0    1        0        0
#32    0    0    0    0    0        0        0

Answer 1

也许粘贴然后使用正则表达式？

t(
  sapply(
    # paste all columns
    do.call(paste0, my.data),
    function(i){
      c(first.11 = regexpr("11", i)[1] + 1, 
        first.10 = regexpr("10", i)[1] + 1)
    })
)

修改

ff_regex <- function(x, pat){ pat <- paste(pat,collapse = "") sapply( # paste all columns do.call(paste0, x), function(i){ regexpr(pat, i)[1] + 1 }) } # benchmark #test if results match all(ff(my.data, c(1, 1)) == my.data2$first.11) #[1] TRUE all(ff_regex(my.data, c(1, 1)) == my.data2$first.11) #[1] TRUE library(microbenchmark) microbenchmark( ff(my.data, c(1, 1)), ff_regex(my.data, c(1, 1)), times = 10000 ) # Unit: microseconds # expr min lq mean median uq max neval cld # ff(my.data, c(1, 1)) 836.442 902.013 958.7856 919.687 943.064 43851.35 10000 b # ff_regex(my.data, c(1, 1)) 199.845 218.376 240.5664 226.929 240.043 42231.78 10000 a

Answer 2

另一个想法是检查模式中以前匹配的以下列：

ff = function(x, pat)
{
    nc = ncol(x) - (length(pat) - 1L)
    ans = arrayInd(seq_len(nrow(x) * nc), c(nrow(x), nc))
    for(i in seq_along(pat)) {
        ans = ans[x[ans] == pat[[i]], ]
        ans[, 2L] = ans[, 2L] + 1L
    }
    inds = aggregate(list(ans[, 2L] - 1L), list(ans[, 1L]), min) 
    ret = integer(nrow(x))
    ret[inds[[1L]]] = inds[[2L]]
    ret
}
all.equal(ff(my.data, c(1, 1)), my.data2$first.11)
#[1] TRUE
all.equal(ff(my.data, c(1, 0)), my.data2$first.10)
#[1] TRUE

而且，对于更长的模式：

ff(my.data, c(1, 0, 1, 1))
# [1] 0 0 0 0 5 0 0 0 4 4 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0

按行查找第一次出现的模式

2 个答案: