Question

我希望采用不断增加的数字序列（例如一系列次数）

set.seed(41);  d <- seq(1:100) + runif(100, 0, 1)

如果两个连续数字之间的差异低于阈值，则通过取两者的平均值将它们合并为单个点，然后继续进行直到下一次需要组合。我使用了我通常避免的函数：while和ifelse来编写一个快速而肮脏的函数，它起作用但速度不快。你能否更有效地解决这个任务1）2）无需调用for或while循环。是否有一些内置功能，可能具有更多功能，非常适合这样的任务？

combine_points <- function(x, th=0.5)
{ 
    i = 1                           # start i at 1
    while(min(diff(x)) < th)        # initiate while loop
    {
    ifelse(x[i+1] - x[i] < th,      # logical condition
           x[i] <- x[i+1] <- 
           mean(c(x[i+1], x[i])),      # assignment if TRUE
           (x[i] <- x[i]))          # assignment if FALSE
    x <- sort(unique(x))            # get rid of the duplicated entry created when
                                    # the ifelse statement was TRUE
    # increment i or reset i to 1 if it gets too large
    ifelse(i == length(x), i <- 1, i <- i+1 )
    }
    return(x)
}

newd <- combine_points(d)  
th <- 0.5
which(diff(newd) < th)

integer(0)

到目前为止更新解决方案的基准。

我使用更大的样本向量进行基准测试，当向量变长时，@ Roland建议的Rcpp解决方案比我的第一个while循环慢。我对初始while循环进行了改进，并制作了它的Rcpp版本。基准测试结果如下。请注意@flodel答案不能直接比较，因为它是一种根本不同的组合方法，但它肯定非常快。

set.seed(41);  d <- seq(1:4000) + runif(4000, 0, 1)
library(microbenchmark)
microbenchmark(
     combine_points.Frank(d,th=0.5),
     combine_points.Frank2(d,th=0.5),
     combine_points_Roland(d,th=0.5),
     combine_points_Roland2(d,th=0.5))
Unit: milliseconds
                                expr       min        lq    median        uq        max neval
   combine_points.Frank(d, th = 0.5) 2115.6391 2154.5038 2174.5889 2193.8444  7884.1638   100
  combine_points.Frank2(d, th = 0.5) 1298.2923 1323.2214 1341.5357 1357.4260 15538.0872   100
  combine_points_Roland(d, th = 0.5) 2497.9106 2506.5960 2512.3591 2519.0036  2573.2854   100
 combine_points_Roland2(d, th = 0.5)  494.8406  497.3613  498.2347  499.8777   544.9743   100

这比我的第一次尝试有了相当大的改进，以下是目前为止速度最快的Rcpp版本：

combine_points.Frank2 <- function(x, th=0.5)
{ 
    i = 1
    while(min(diff(x)) < th)
    {
    if(x[i+1] - x[i] >= th){ 
           i <- i + 1}
    else {
           x[i] <- x[i+1] <- 
           mean(c(x[i+1], x[i]));x <- unique(x); i <- i }  
    }
    return(x)
}

Rcpp版

cppFunction('
 NumericVector combine_points_Roland2(NumericVector x, double th) {
  int i=0;
  while(min(diff(x)) < th)   
  {
    if ((x[i+1] - x[i]) >= th)
    {
      i = i + 1;
    }
    else{
      x[i] = (x[i+1] + x[i])/2;
      x[i+1] = x[i];
      x = sort_unique(x);
      i = i;
    }
   }
   return x;
}
')

Answer 1

看看这是否符合您的要求：

combine_points <- function(x, th=0.5) {
  group <- cumsum(c(FALSE, diff(x) > th))
  unname(sapply(split(x, group), mean))
}

combine_points(c(-1, 0.1, 0.2, 0.3, 1, 1.5, 2.0, 2.5, 3.0, 10), 0.5)
# [1] -1.0  0.2  2.0 10.0

Answer 2

这是您的功能到Rcpp的翻译。它使用糖功能，非常方便，但通常有更快的替代品（RcppEigen或RcppArmadillo有利于此）。当然，算法可以改进。

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector combine_points1(NumericVector x, double th) {
  int i=0;
  while(min(diff(x)) < th)   
  {
    if ((x[i+1] - x[i]) < th)
    {
      x[i] = (x[i+1] + x[i])/2;
      x[i+1] = x[i];
    } 
    x = sort_unique(x);
    if(i <= x.size())
    {
      i = i+1;
    }
    else {
      i=1;
    }
  }
   return x;
}

我建议使用RStudio编写Rcpp函数并获取它们。

all.equal(combine_points1(d, 0.5),
          combine_points(d, 0.5))
#[1] TRUE

library(compiler)
combine_points_comp <- cmpfun(combine_points) 

library(microbenchmark)
microbenchmark(combine_points1(d, 0.5),
               combine_points_comp(d, 0.5),
               combine_points(d, 0.5))

# Unit: microseconds
#                        expr      min        lq    median        uq       max neval
#     combine_points1(d, 0.5)  652.772  664.6815  683.1315   714.653  1030.171   100
# combine_points_comp(d, 0.5) 8344.839 8692.0880 9010.1470 10627.049 14117.553   100
#      combine_points(d, 0.5) 8996.768 9371.0805 9687.0235 10560.226 12800.831   100

在没有真正努力的情况下加速14倍。

Answer 3

这是更快的事情。它避免在循环中调整大小/复制x。

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
 NumericVector combine_points_Roland3(NumericVector x, double th) {
  int i=0, j;
  int n(x.size());
  while(i < n-1)   
  {
    if ((x[i+1] - x[i]) >= th)
    {
      i = i + 1;
    }
    else{
      x[i] = (x[i+1] + x[i])/2;
      n = n-1;
      for (j=i+1; j<n; j++) 
      {
        x[j]=x[j+1];
      }      
    }
  }
  NumericVector y(n);
  for (i = 0; i < n; i++) {
    y[i] = x[i];
  }
   return y;
}

相同算法的R实现：

combine_points_Roland3R <- function(x, th) {
  i <- 1
  n <- length(x)

  while(i < n) {
    if ((x[i+1] - x[i]) >= th) {
      i <- i + 1;
    } else {
      x[i] <- (x[i+1] + x[i])/2
      n <- n-1
      x[(i+1):n] <- x[(i+2):(n+1)]
    }
  }
  x[1:n]
}

set.seed(41);  d <- seq(1:4000) + runif(4000, 0, 1)
x2 <- combine_points_Roland2(d, 0.5)
x3 <- combine_points_Roland3(d, 0.5)
all.equal(x2, x3)
#TRUE
x4 <- combine_points_Roland3R(d, 0.5)
all.equal(x2, x4)
#TRUE

基准：

library(microbenchmark)
microbenchmark(combine_points_Roland2(d, 0.5),
               combine_points_Roland3(d, 0.5), 
               combine_points_Roland3R(d, 0.5))

# Unit: microseconds
#                            expr       min         lq      median          uq        max neval
#  combine_points_Roland2(d, 0.5) 126458.64 131414.592 132355.4285 133422.2235 147306.728   100
#  combine_points_Roland3(d, 0.5)    121.34    128.269    140.8955    143.3595    393.582   100
# combine_points_Roland3R(d, 0.5)  17564.24  18626.878  19155.6565  20910.2935  68707.888   100

R：当它们的差异低于某个阈值时的平均顺序值

到目前为止更新解决方案的基准。

3 个答案: