Question

我之前在这里（here）问过一个相关的问题，但我意识到计算那个复杂的度量花费了太多时间（目标是用于随机化检验，因此速度很关键）。所以我决定放弃加权，只使用两个度量之间的最小距离。这里我有 2 个向量（为了演示放在数据框中，但实际上它们是两个独立的向量）。

       x     y
1  FALSE  TRUE
2  FALSE FALSE
3   TRUE FALSE
4  FALSE FALSE
5  FALSE  TRUE
6  FALSE FALSE
7  FALSE FALSE
8   TRUE FALSE
9  FALSE  TRUE
10  TRUE  TRUE
11 FALSE FALSE
12 FALSE FALSE
13 FALSE FALSE
14 FALSE  TRUE
15  TRUE FALSE
16 FALSE FALSE
17  TRUE  TRUE
18 FALSE  TRUE
19 FALSE FALSE
20 FALSE  TRUE
21 FALSE FALSE
22 FALSE FALSE
23 FALSE FALSE
24 FALSE FALSE
25  TRUE FALSE

这里我有一些代码可以找到最小距离，但我需要更快的速度（删除不必要的调用并更好地向量化）。也许在 base R 中已经无法更快了。

## MWE example: the data
## Two logical vectors of length 25; TRUE marks the positions to compare.
x <- replace(logical(25), c(3, 8, 10, 15, 17, 25), TRUE)
y <- replace(logical(25), c(1, 5, 9, 10, 14, 17, 18, 20), TRUE)

## Positions of the TRUE entries, used as input to the distance code
xw <- which(x)
yw <- which(y)

# For every position in `xw`, the smallest absolute gap to any position in
# `yw`. Each element of `xw` is compared against the whole of `yw`.
min_dist <- function(xw, yw) {
  nearest_gap <- function(pos) {
    gaps <- abs(pos - yw)
    min(gaps)
  }
  per_position <- lapply(xw, nearest_gap)
  unlist(per_position)
}

## Expected result for the data above: 2 1 0 1 0 5
min_dist(xw, yw)

有没有办法在 base R 中提高性能？或者改用 dplyr 或 data.table？

我的实际向量要长得多（10,000 个以上的元素）。

编辑：针对 flodel 的基准测试。flodel 的方案存在一个问题（我在 MWE 中已预料到），而我也不确定如何修复：如果任何 x 位置小于最小的 y 位置，就会出现问题。

# Second data set: the first x position (3) lies below the smallest
# y position (5).
x <- replace(logical(25), c(3, 8, 9, 15, 17, 25), TRUE)
y <- replace(logical(25), c(5, 9, 10, 13, 15, 17, 19), TRUE)

# Positions of the TRUE entries
xw <- which(x)
yw <- which(y)

# Attempt at a vectorised nearest-distance lookup, shown here because it
# misbehaves when some xw lies below the smallest yw.
#
# xw, yw: increasing integer positions, as produced by which() above.
flodel <- function(xw, yw) {
   # Index of the last yw that is <= each xw; this is 0 for any xw smaller
   # than every yw.
   i <- findInterval(xw, yw)
   # A zero index selects nothing, so yw[i] comes back shorter than xw and is
   # recycled against it. That triggers the warning below and lets negative,
   # misaligned "distances" through.
   pmin(xw - yw[i], yw[i+1L] - xw, na.rm = TRUE)
}

flodel(xw, yw)

## [1] -2 -1 -6 -2 -2 20
## Warning message:
## In xw - yw[i] :
##   longer object length is not a multiple of shorter object length

Answer 1

# Distance from each TRUE position in `x` to the nearest TRUE position in `y`.
#
# x, y: logical vectors.
# Returns an integer vector with one entry per TRUE in `x`; every entry is
# NA when `y` contains no TRUE at all.
flodel <- function(x, y) {
  xw <- which(x)
  yw <- which(y)
  n_y <- length(yw)
  if (n_y == 0L) {
    # Nothing to measure against: keep one (missing) result per x position.
    return(rep(NA_integer_, length(xw)))
  }
  # Index of the last y position <= each x position: 0 when x precedes every
  # y, n_y when x is at or beyond the last one.
  i <- findInterval(xw, yw)
  # Clamp both neighbours into 1..n_y instead of relying on
  # `all.inside = TRUE`, which maps to index 0 when `y` has a single TRUE.
  below <- yw[pmax(i, 1L)]
  above <- yw[pmin(i + 1L, n_y)]
  pmin(abs(xw - below), abs(xw - above))
}

# Distance from each TRUE position in `x` to the nearest TRUE position in
# `y`, found by carrying y positions forward and backward with zoo.
#
# x, y: logical vectors, expected to have the same length.
# Returns one distance per TRUE in `x` (NA where `y` has no TRUE at all).
GG1 <- function(x, y) {
  # Position index where y is TRUE, NA elsewhere.
  yy <- ifelse(y, TRUE, NA) * seq_along(y)
  # na.rm = FALSE keeps the NAs that cannot be filled, so both vectors stay
  # the same length as `x`. The default trims them, which shifts the forward
  # fill out of alignment whenever y does not start with TRUE.
  fwd <- zoo::na.locf(yy, fromLast = FALSE, na.rm = FALSE)[x]
  bck <- zoo::na.locf(yy, fromLast = TRUE, na.rm = FALSE)[x]
  wx <- which(x)
  # One side is NA when there is no y before (or after) the x position.
  pmin(wx - fwd, bck - wx, na.rm = TRUE)
}

# data.table variant: rolling join of the x positions onto the y positions.
# NOTE(review): the accompanying text says this was run with data.table
# 1.8.11; how `j` sees `x` and `y` inside a join may differ in other
# versions, so confirm the result before relying on it.
GG2 <- function(x, y) {
  require(data.table)
  # One row per TRUE position in x (the lookup values).
  dtx <- data.table(x = which(x))
  # One row per TRUE position in y, keyed on y so it can be joined on.
  dty <- data.table(y = which(y), key = "y")
  # Intended to roll each x position to the nearest y position and return
  # the absolute difference.
  dty[dtx, abs(x - y), roll = "nearest"] 
}

示例数据：

# Base pattern: two logical vectors of length 25.
x <- replace(logical(25), c(3, 8, 10, 15, 17, 25), TRUE)
y <- replace(logical(25), c(1, 5, 9, 10, 14, 17, 18, 20), TRUE)

# Tile the pattern 100 times (2500 elements per vector).
X <- rep(x, times = 100)
Y <- rep(y, times = 100)

单元测试：

# Both implementations must return exactly the same vector (values and type).
identical(flodel(X, Y), GG1(X, Y))
# [1] TRUE

基准：

library(microbenchmark)
# Time all three implementations on the 2500-element X and Y; the figures
# below are the recorded output of that run.
microbenchmark(flodel(X,Y), GG1(X,Y), GG2(X,Y))
# Unit: microseconds
#          expr       min         lq     median        uq        max neval
#  flodel(X, Y)   115.546   131.8085   168.2705   189.069   1980.316   100
#     GG1(X, Y)  2568.045  2828.4155  3009.2920  3376.742  63870.137   100
#     GG2(X, Y) 22210.708 22977.7340 24695.7225 28249.410 172074.881   100

[Matt Dowle 编辑] 24695 微秒 = 0.024 秒。在微小数据上用微基准测试得出的结论，很少能推广到有实际意义的数据规模。

[flodel 编辑] 我的向量长度为 2500，考虑到 Tyler 所说的规模（10k），这已经相当有意义了；不过没关系，让我们试试长度为 2.5e7 的向量。鉴于这种情况，希望你能原谅我改用 system.time：

# Repeat the 25-element pattern 1e6 times (2.5e7 elements per vector) and
# time one run of each implementation.
X <- rep(x, 1e6)
Y <- rep(y, 1e6)
system.time(flodel(X,Y))
#    user  system elapsed 
#   0.694   0.205   0.899 
system.time(GG1(X,Y))
#    user  system elapsed 
#  31.250  16.496 112.967 
# GG2 did not complete at this size; the recorded error follows.
system.time(GG2(X,Y))
# Error in `[.data.table`(dty, dtx, abs(x - y), roll = "nearest") : 
#   negative length vectors are not allowed

[Arun 编辑] - 使用 data.table 1.8.11 在 2.5e7 规模上的基准测试：
[Arun 编辑 2] - 在 Matt 最近加快二分查找/合并之后，更新了计时结果

require(data.table)
# data.table rolling-join variant used for the 2.5e7-element benchmark.
# x, y: logical vectors. Returns abs(y - y1) evaluated on the result of the
# roll = "nearest" join of the x positions onto the y positions.
arun <- function(x, y) {
    dtx <- data.table(x=which(x))
    # NOTE(review): presumably flags the table as already keyed on x without
    # re-sorting (which() returns increasing positions) -- confirm against
    # the data.table documentation.
    setattr(dtx, 'sorted', 'x')
    dty <- data.table(y=which(y))
    setattr(dty, 'sorted', 'y')
    # Keep a copy of y. The last step compares y with y1, so after the join
    # the two are expected to differ (y holding the looked-up x position,
    # y1 the matched y position) -- TODO confirm.
    dty[, y1 := y]
    dty[dtx, roll="nearest"][, abs(y-y1)]
}

# minimum of three consecutive runs
system.time(ans1 <- arun(X,Y))
#   user  system elapsed 
#  1.036   0.138   1.192 

# minimum of three consecutive runs
system.time(ans2 <- flodel(X,Y))
#   user  system elapsed 
#  0.983   0.197   1.221 

# Check that both approaches return exactly the same vector.
identical(ans1, ans2) # [1] TRUE

Answer 2

这是两个解决方案。既不使用循环也不使用apply函数。

1）第一个方案与我在上一个问题（prior question）中发布的解法在 z 为 1 时相同，只是这里的简化假设让我们可以把它稍微缩短，并且相对于那个解法，答案减少了 1。

library(zoo)

# Position index where y is TRUE, NA elsewhere.
yy <- ifelse(y, TRUE, NA) * seq_along(y)
# na.rm = FALSE keeps the NAs that cannot be filled, so both vectors stay
# the same length as x. The default trims them, which shifts the forward
# fill out of alignment whenever y does not start with TRUE.
fwd <- na.locf(yy, fromLast = FALSE, na.rm = FALSE)[x]
bck <- na.locf(yy, fromLast = TRUE, na.rm = FALSE)[x]
wx <- which(x)
# One side is NA when there is no y before (or after) the x position.
pmin(wx - fwd, bck - wx, na.rm = TRUE)

2）第二个是data.table解决方案。 data.table可以采用roll="nearest"参数，这似乎正是您所需要的：

library(data.table)

# One row per TRUE position in x (the lookup values).
dtx <- data.table(x = which(x))
# One row per TRUE position in y, keyed on y for the join.
dty <- data.table(y = which(y), key = "y")
# Intended to roll each x position to the nearest y position and return the
# absolute difference.
# NOTE(review): written against data.table 1.8.11 (see the note below);
# confirm the result on the version you run.
dty[dtx, abs(x - y), roll = "nearest"]

我不确定这是否重要，但我使用的是data.table版本1.8.11（CRAN版本目前为1.8.10）。

2个逻辑向量的元素之间的快速最小距离（间隔）（取2）

2 个答案: