解释

Question

我有一个长矢量，我需要根据阈值将其分成段。段是超过阈值的连续值。当值低于阈值时，段结束，下一段开始，其中值再次超过阈值。我需要记录每个段的开始和结束索引。

以下是效率低下的实施方案。写这个最快最合适的方法是什么？这非常难看，我必须假设这是一个更清洁的实现。

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0
segments <- list()
in.segment <- FALSE
for(i in 1:length(test.vec)){

    # If we're in a segment
    if(in.segment){
        if(test.vec[i] > threshold){
            next
        }else{
            end.ind <- i - 1
            in.segment <- FALSE
            segments[[length(segments) + 1]] <- c(start.ind, end.ind)
        }
    }

    # if not in segment
    else{
        if(test.vec[i] > threshold){        
            start.ind <- i
            in.segment <- TRUE
        }
    }
}

编辑：所有解决方案的运行时

感谢所有回复，这非常有帮助且非常有启发性。下面是对所有五种解决方案的小测试（提供的四个加上原始示例）。正如您所看到的，这四个都是对原始解决方案的巨大改进，但Khashaa的解决方案是目前最快的解决方案。

set.seed(1)
test.vec <- rnorm(1e6, 8, 10);threshold <- 0

originalFunction <- function(x, threshold){
    segments <- list()
    in.segment <- FALSE
    for(i in 1:length(test.vec)){

    # If we're in a segment
        if(in.segment){
            if(test.vec[i] > threshold){
                next
            }else{
                end.ind <- i - 1
                in.segment <- FALSE
                segments[[length(segments) + 1]] <- c(start.ind, end.ind)
            }
        }

    # if not in segment
        else{
            if(test.vec[i] > threshold){        
                start.ind <- i
                in.segment <- TRUE
            }
        }
    }
    segments
}

SimonG <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)
}

Rcpp::cppFunction('DataFrame Khashaa(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')

bgoldst <- function(x, threshold){
    with(rle(x>threshold),
         t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,])   
}

ClausWilke <- function(x, threshold){
    suppressMessages(require(dplyr, quietly = TRUE))
    in.segment <- (x > threshold)
    start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
    end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
    data.frame(start, end)    
}

system.time({ originalFunction(test.vec, threshold); })
 ## user  system elapsed 
 ## 66.539   1.232  67.770 
system.time({ SimonG(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.028   0.008   0.036 
system.time({ Khashaa(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.008   0.000   0.008 
system.time({ bgoldst(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.065   0.000   0.065 
system.time({ ClausWilke(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.274   0.012   0.285

Answer 1

这是另一种选择，主要使用which。通过查找hit序列的非连续元素来确定起点和终点。

test.vec <- rnorm(100, 8, 10)
threshold <- 0


findSegments <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)

}

findSegments(test.vec, threshold=0)

这就是：

> findSegments(test.vec, threshold=0)
      starts ends
 [1,]      1    3
 [2,]      5    7
 [3,]      9   11
 [4,]     13   28
 [5,]     30   30
 [6,]     32   32
 [7,]     34   36
 [8,]     38   39
 [9,]     41   41
[10,]     43   43
[11,]     46   51
[12,]     54   54
[13,]     56   61
[14,]     63   67
[15,]     69   72
[16,]     76   77
[17,]     80   81
[18,]     83   84
[19,]     86   88
[20,]     90   92
[21,]     94   95
[22,]     97   97
[23,]    100  100

将其与原始序列进行比较：

> round(test.vec,1)
  [1]  20.7  15.7   4.3 -15.1  24.6   9.4  23.2  -4.5  16.9  20.9  13.2  -1.2
 [13]  22.6   7.7   6.0   6.6   4.1  21.3   5.3  16.7  11.4  16.7  19.6  16.7
 [25]  11.6   7.3   3.7   8.4  -4.5  11.7  -7.1   8.4 -18.5  12.8  22.5  11.0
 [37]  -3.3  11.1   6.9  -7.9  22.9  -3.7   3.5  -7.1  -5.9   3.5  13.2  20.0
 [49]  13.2  23.4  15.9  -5.0  -6.3  10.0  -6.2   4.7   2.1  26.4   5.9  27.3
 [61]  14.3 -12.4  28.4  30.9  18.2  11.4   5.7  -4.5   6.2  12.0  10.9  11.1
 [73]  -2.0  -9.0  -1.4  15.4  19.1  -1.6  -5.4   5.4   7.8  -5.6  15.2  13.8
 [85] -18.8   7.1  17.1   9.3  -3.9  22.6   1.7  28.9 -21.3  21.2   8.2 -15.4
 [97]   3.2 -10.2  -6.2  14.1

Answer 2

我喜欢 for loops 以便转换为 Rcpp 很简单。

＆＃xA;＆＃xA;

  Rcpp :: cppFunction （'DataFrame findSegment（NumericVector x，double threshold）{＆＃xA; x.push_back（-1）;＆＃xA; int n = x.size（），startind，endind;＆＃xA; std :: vector＆lt; int＆gt; startinds，endinds;＆＃xA; bool insegment = false;＆＃xA; for（int i = 0; i＆lt; n; i ++）{＆＃xA; if（！insegment）{＆＃xA; if（x [i]＆gt; threshold）{＆＃xA; startind = i + 1;＆＃xA; insegment = true;}＆＃xA;} else {＆＃xA; if（x [i]＆lt; threshold）{＆ #xA; endind = i;＆＃xA; insegment = false;＆＃xA; startinds.push_back（startind）;＆＃xA; endinds.push_back（endind）;＆＃xA;}＆＃xA;}＆＃xA ;}＆＃xA;返回DataFrame :: create（_ [“start”] = startinds，_ [“end”] = endinds）;＆＃xA;}'）＆＃xA; set.seed（1）; test.vec＆lt;  -  rnorm（1e7,8,10）; threshold＆lt;  -  0;＆＃xA; system.time（findSegment（test.vec，threshold））＆＃xA;＆＃xA; #user system逝去＆＃xA;＃0.045 0.000 0.045＆＃xA;＆＃xA ;＃@ SimonG的解决方案＆＃xA; system.time（findSegments（test.vec，threshold））＆＃xA; #user system逝去＆＃xA;＃0.533 0.012 0.548＆＃xA;

＆ #xA;

Answer 3

with(rle(test.vec>threshold),t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,]);
##       [,1] [,2]
##  [1,]    1    8
##  [2,]   10   13
##  [3,]   16   17
##  [4,]   20   26
##  [5,]   28   28
##  [6,]   30   34
##  [7,]   36   38
##  [8,]   41   46
##  [9,]   48   49
## [10,]   51   53
## [11,]   55   81
## [12,]   84   90
## [13,]   92  100

解释

test.vec>threshold
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

使用矢量化比较计算输入向量中的哪些元素高于阈值。

rle(...)
## Run Length Encoding
##   lengths: int [1:25] 8 1 4 2 2 2 7 1 1 1 ...
##   values : logi [1:25] TRUE FALSE TRUE FALSE TRUE FALSE ...

计算逻辑向量的行程编码。这将返回一个列为'rle'的列表，其中包含两个命名组件：lengths，包含每个游程的长度; values，包含运行该长度的值，其中包含case将是TRUE或FALSE，前者代表感兴趣的片段，后者代表非片段游程长度。

with(...,...)

第一个参数是如上所述的行程编码。这将评估由'rle' - 分类列表组成的虚拟环境中的第二个参数，从而使lengths和values组件可作为词汇变量访问。

下面我将深入探讨第二个论点的内容。

cumsum(lengths)
##  [1]   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

计算lengths的累积总和。这将构成计算每个游程长度的起始索引和结束索引的基础。关键点：cumsum的每个元素代表该游程长度的结束索引。

rep(...,2L)
##  [1]   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

复制累计金额。第一次重复将作为开始索引的基础，第二次重复将作为开始索引的基础。我将在此后将这些重复称为＆＃34; start-index重复＆＃34;和＃34;结束指数重复＆＃34;。

c(0L,...[-length(lengths)])
##  [1]   0   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91   8   9  13  15  17  19  26  27  28  29  34  35  38  40  46  47  49  50  53  54  81  83  90  91 100

这将删除start-index重复结束时的最后一个元素，并将零添加到它的开头。这有效地延迟了一个元素的起始索引重复。这是必要的，因为我们需要通过在之前的运行长度的结束索引中加1来计算每个起始索引，将0作为之前不存在的游程长度的结束索引。第一个。

matrix(...,2L,byrow=T)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## [1,]    0    8    9   13   15   17   19   26   27    28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91
## [2,]    8    9   13   15   17   19   26   27   28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91   100

这构建了前一个结果中的两行矩阵。滞后的起始索引重复是顶行，结束索引重复是底行。

...+1:0
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## [1,]    1    9   10   14   16   18   20   27   28    29    30    35    36    39    41    47    48    50    51    54    55    82    84    91    92
## [2,]    8    9   13   15   17   19   26   27   28    29    34    35    38    40    46    47    49    50    53    54    81    83    90    91   100

R首先跨行循环这两个元素的加数，然后跨列循环，因此这会在顶行添加一个。这样就完成了起始索引的计算。

t(...)
##       [,1] [,2]
##  [1,]    1    8
##  [2,]    9    9
##  [3,]   10   13
##  [4,]   14   15
##  [5,]   16   17
##  [6,]   18   19
##  [7,]   20   26
##  [8,]   27   27
##  [9,]   28   28
## [10,]   29   29
## [11,]   30   34
## [12,]   35   35
## [13,]   36   38
## [14,]   39   40
## [15,]   41   46
## [16,]   47   47
## [17,]   48   49
## [18,]   50   50
## [19,]   51   53
## [20,]   54   54
## [21,]   55   81
## [22,]   82   83
## [23,]   84   90
## [24,]   91   91
## [25,]   92  100

转置为双列矩阵。如果您将结果作为双行矩阵，那么这不是完全必要的。

...[values,]
##       [,1] [,2]
##  [1,]    1    8
##  [2,]   10   13
##  [3,]   16   17
##  [4,]   20   26
##  [5,]   28   28
##  [6,]   30   34
##  [7,]   36   38
##  [8,]   41   46
##  [9,]   48   49
## [10,]   51   53
## [11,]   55   81
## [12,]   84   90
## [13,]   92  100

子集只是感兴趣的部分。由于values是表示哪些运行长度超过阈值的逻辑向量，因此我们可以直接将其用作行索引向量。

性能

我想我在这里搞砸了自己，但SimonG的解决方案的表现差不多是我的两倍：

bgoldst <- function() with(rle(test.vec>threshold),t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,]);
simong <- function() findSegments(test.vec,threshold);
set.seed(1); test.vec <- rnorm(1e7,8,10); threshold <- 0;
identical(bgoldst(),unname(simong()));
## [1] TRUE
system.time({ bgoldst(); })
##    user  system elapsed
##   1.344   0.204   1.551
system.time({ simong(); })
##    user  system elapsed
##   0.656   0.109   0.762

+1来自我......

Answer 4

这是我认为更简单的另一种解决方案。请注意，您必须使用set.seed(10)而不是set.seed <- 10来设置随机数生成器的种子。

require(dplyr) # for lead() and lag()

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0

in.segment <- (test.vec > threshold)
start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
segments <- data.frame(start, end)

head(segments)
##   start end
## 1     1   2
## 2     4   6
## 3     8   8
## 4    10  16
## 5    18  21
## 6    23  23

一般来说，在R中，如果你发现自己编写了复杂的循环，而且如果你发表声明，你可能会做错了。大多数问题都可以用矢量化的形式解决。

根据R

4 个答案:

解释

性能