我怎么能加快以下的循环?

时间:2014-03-06 10:14:59

标签: r performance

# For every element of `a`, look up the last element of `b` and of `c` that is
# strictly smaller than it. This is the straightforward loop whose running
# time the question is about, so the per-element scan is kept as is.
a <- seq(from = 1, to = 50000, by = 1)
b <- seq(from = 1, to = 10000, by = 2)
c <- seq(from = 1, to = 10000, by = 3)

# Preallocate the index vectors as integer NA. A bare (logical) NA would turn
# b[two] / c[three] into recycled logical indexing if no match were ever
# stored, returning the wrong number of elements.
two <- rep(NA_integer_, length(a))
three <- rep(NA_integer_, length(a))

system.time(
  for (i in seq_along(a)) {
    # tail(which(...), 1) is the position of the last element smaller than
    # a[i], or integer(0) when there is none. Positions are stored only when
    # both lookups succeed; otherwise the preallocated NA stays in place.
    if (length(tail(which(a[i] > b), 1)) != 0 &&
        length(tail(which(a[i] > c), 1)) != 0) {
      two[i] <- tail(which(a[i] > b), 1)
      three[i] <- tail(which(a[i] > c), 1)
    }
  }
)

# Translate the stored positions back into values (NA where nothing matched).
build_b <- b[two]
build_c <- c[three]

我要做的是:对 `a` 中的每个元素,分别在 `b` 和 `c` 中找到最后一个小于它的元素。为此,我预先为向量 `two` 和 `three` 分配了内存以节省一些时间,并用它们记录这些元素的索引。循环完成后,我再根据刚刚计算出的索引构建新的向量。目前,这个操作大约需要10秒。我的问题是:如何加快这项操作?

谢谢!

2 个答案:

答案 0 :(得分:3)

以下是使用findInterval的另一种解决方案:

## Assumes b and c are sorted in non-decreasing order (findInterval needs a
## sorted lookup vector).
## With left.open = TRUE, findInterval returns for each element of a the
## number of lookup values strictly smaller than it, i.e. the position of the
## last such value. Unlike shifting a by one, this also holds for
## non-integer data.
two <- findInterval(a, b, left.open = TRUE)
three <- findInterval(a, c, left.open = TRUE)

## A count of 0 means no smaller value exists. As in the question's loop, both
## positions are blanked whenever either lookup finds nothing.
no_match <- two == 0L | three == 0L
two[no_match] <- NA
three[no_match] <- NA

build_b <- b[two]
build_c <- c[three]

下面是一个简单的基准测试:

# Benchmark inputs: a holds every whole number from 1 to 50000; b and c are
# the sparser lookup vectors (step 2 and step 3, upper limit 10000).
a <- seq(1, 50000, by = 1)
b <- seq(1, 10000, by = 2)
c <- seq(1, 10000, by = 3)

# Baseline for the benchmark: the question's original loop as a function.
#
# For each element of `a`, finds the last position in `b` and in `c` whose
# value is strictly smaller, and returns the values at those positions.
#
# Args:
#   a: numeric vector of query values.
#   b, c: numeric lookup vectors.
#
# Returns:
#   list(b = ..., c = ...), each of length(a). An entry is NA in both
#   components when either lookup finds no smaller value.
pops <- function(a, b, c) {
  # Integer NA (not the default logical NA) so that b[two] / c[three] always
  # index by position; an all-logical-NA index would be recycled to the
  # length of the lookup vector and return the wrong number of elements.
  two <- rep(NA_integer_, length(a))
  three <- rep(NA_integer_, length(a))

  for (i in seq_along(a)) {
    # The lookups are intentionally repeated as in the question's code, since
    # this function is the timed baseline. Positions are stored only when
    # both lookups succeed; otherwise the preallocated NA remains.
    if (length(tail(which(a[i] > b), 1)) != 0 &&
        length(tail(which(a[i] > c), 1)) != 0) {
      two[i] <- tail(which(a[i] > b), 1)
      three[i] <- tail(which(a[i] > c), 1)
    }
  }

  list(b = b[two], c = c[three])
}

# Loop-based variant that replaces tail(which(...), 1) with a count.
#
# For each element of `a`, counts how many values of `b` and of `c` are
# strictly smaller and uses those counts as positions. The count equals the
# position of the last smaller value only when `b` and `c` are sorted in
# increasing order.
#
# Args:
#   a: numeric vector of query values.
#   b, c: numeric lookup vectors (expected to be sorted, see above).
#
# Returns:
#   list(b = ..., c = ...), each of length(a). An entry is NA in both
#   components when either lookup vector has no smaller value.
droopy <- function(a, b, c) {
  # Integer NA (not the default logical NA) so that b[two] / c[three] always
  # index by position, even when no match is ever stored.
  two <- rep(NA_integer_, length(a))
  three <- rep(NA_integer_, length(a))

  for (i in seq_along(a)) {
    smaller_b <- a[i] > b
    smaller_c <- a[i] > c
    # Scalar condition, so use the short-circuiting &&.
    if (any(smaller_b) && any(smaller_c)) {
      two[i] <- sum(smaller_b)
      three[i] <- sum(smaller_c)
    }
  }

  list(b = b[two], c = c[three])
}

# Vectorized lookup based on findInterval().
#
# For each element of `a`, returns the last value of `b` and of `c` that is
# strictly smaller than it.
#
# Args:
#   a: numeric vector of query values.
#   b, c: numeric lookup vectors, sorted in non-decreasing order
#     (findInterval needs a sorted lookup vector).
#
# Returns:
#   list(b = ..., c = ...), each of length(a). An entry is NA in both
#   components when either lookup vector has no smaller value, matching the
#   loop-based implementations.
sgibb <- function(a, b, c) {
  # left.open = TRUE counts the lookup values strictly smaller than each
  # query, which is the position of the last such value. Unlike shifting `a`
  # by one, this is also correct for non-integer data.
  two <- findInterval(a, b, left.open = TRUE)
  three <- findInterval(a, c, left.open = TRUE)

  # A count of 0 means "no smaller value"; blank both positions together.
  no_match <- two == 0L | three == 0L
  two[no_match] <- NA
  three[no_match] <- NA

  list(b = b[two], c = c[three])
}

基准:

library(rbenchmark)

# Time the three implementations on the same inputs; the result table is
# ordered by the "relative" column.
benchmark(
  pops(a, b, c),
  droopy(a, b, c),
  sgibb(a, b, c),
  order = "relative",
  replications = 2
)
#             test replications elapsed relative user.self sys.self user.child sys.child
#3  sgibb(a, b, c)            2   0.010      1.0     0.008    0.004          0         0
#2 droopy(a, b, c)            2   8.639    863.9     8.613    0.000          0         0
#1   pops(a, b, c)            2  26.838   2683.8    26.753    0.004          0         0

# Check that both loop-based versions agree with sgibb() on these inputs
# (recorded output shown below each call).
identical(pops(a, b, c), sgibb(a, b, c))
## TRUE
identical(droopy(a, b, c), sgibb(a, b, c))
## TRUE

答案 1 :(得分:1)

一种可能性:

# Faster loop: count the smaller values instead of calling tail(which(...)).
# The count equals the position of the last smaller value because b and c
# are increasing sequences.
a <- seq(from = 1, to = 50000, by = 1)
b <- seq(from = 1, to = 10000, by = 2)
c <- seq(from = 1, to = 10000, by = 3)

# Preallocate with integer NA so that entries without a match need no
# explicit assignment inside the loop.
two <- rep(NA_integer_, length(a))
three <- rep(NA_integer_, length(a))

system.time({
  for (i in seq_along(a)) {
    smaller_b <- a[i] > b
    smaller_c <- a[i] > c
    # Scalar condition, so use the short-circuiting &&. Positions are stored
    # only when both b and c contain a value smaller than a[i].
    if (any(smaller_b) && any(smaller_c)) {
      two[i] <- sum(smaller_b)
      three[i] <- sum(smaller_c)
    }
  }
})

# Translate the stored positions back into values (NA where nothing matched).
build_b <- b[two]
build_c <- c[three]