R中的双循环操作(以示例为例)

时间:2014-12-06 07:10:38

标签: r performance for-loop

请看以下小工作示例:

#### Pseudo data
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
 R <- 6371 # Earth mean radius [km]
 delta.lon <- (lon2 - lon1)
 delta.lat <- (lat2 - lat1)
 a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
 c <- 2 * asin(min(1,sqrt(a)))
 d = R * c
 return(d)
}

ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
ndistance <- nobs1*nobs2 # The number of distances
mydistance <- vector(mode = "numeric", length = ndistance)

k=1
for (i in 1:nobs1) {
 for (j in 1:nobs2) {
  mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
  k=k+1
 }
}

proc.time() - ptm

计算时间:

  user  system elapsed 
249.85    0.16  251.18

在这里,我的问题是是否仍有加速双循环计算的空间。非常感谢你。

5 个答案:

答案 0 :(得分:5)

这是一个选项,可以在我的机器上将运行时间减少到~2秒,因为它的一部分是矢量化的。

直接与原始解决方案进行比较。

测试数据:

nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

原始解决方案:

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d)
}

ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
ndistance <- nobs1*nobs2 # The number of distances
mydistance <- vector(mode = "numeric", length = ndistance)

k=1
for (i in 1:nobs1) {
  for (j in 1:nobs2) {
    mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
    k=k+1
  }
}

proc.time() - ptm
   User      System     elapsed 
148.243       0.681     148.901 

我的方法:

# modified (vectorized) distance function:
thedistance2 <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(pmin(1,sqrt(a)))   # pmin instead of min
  d = R * c
  return(d)
}

ptm2 <- proc.time()

lst <- vector("list", length = nobs1)

for (i in seq_len(nobs1)) {
    lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
}

res <- unlist(lst)

proc.time() - ptm2
   User      System     elapsed
  1.988       0.331       2.319 

结果是否相等?

all.equal(mydistance, res)
#[1] TRUE

答案 1 :(得分:2)

以下是使用Rcpp的其他方法。在我的机器上,它比beginneR的非常漂亮的矢量化版本略快。

library(Rcpp)

nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

sourceCpp("dist.cpp")

system.time({
  lst <- vector("list", length = nobs1)
  for (i in seq_len(nobs1)) {
    lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
  }
  res <- unlist(lst)
})

##  user  system elapsed 
## 4.636   0.084   4.737 


system.time(res2 <- dist_vec(mylon1, mylat1, mylon2, mylat2))
##  user  system elapsed 
## 2.584   0.044   2.633 

all.equal(res, res2)
## TRUE 

文件dist.cpp计算

#include <Rcpp.h>
#include <iostream>
#include <math.h>

using namespace Rcpp;

double dist_cpp(double lon1, double lat1,
                double lon2, double lat2){
  int R = 6371; 
  double delta_lon = (lon2 - lon1);
  double delta_lat = (lat2 - lat1);
  double a = pow(sin(delta_lat/2.0), 2) + cos(lat1) * cos(lat2) * pow(sin(delta_lon/2.0), 2);
  double c;
  a = sqrt(a);
  if (a < 1.0) {
    c = 2 * asin(a);
  } else {
    c = 2 * asin(1);
  }
  double d = R * c;
  return(d);

}

// [[Rcpp::export]]
NumericVector dist_vec(NumericVector lon1, NumericVector lat1, 
                       NumericVector lon2, NumericVector lat2) {
  int n = lon1.size();
  int m = lon2.size();
  int k = n * m;
  NumericVector res(k);
  int c = 0;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < m; j++) {
      res[c] = dist_cpp(lon1[i], lat1[i], lon2[j], lat2[j]);
      c++;
    }
  }
  return(res);
}

答案 2 :(得分:1)

我在这里运行了3次,一次加载库compiler(在基础R中)并运行enableJIT(3),并在编译后运行(使用enableJITthedistance的{​​{1}}功能(cmpfun()中)。结果是(分别):

compiler

与2和3的区别是可以忽略的,因为我认为> proc.time() - ptm user system elapsed 335.26 0.17 339.83 > proc.time() - ptm user system elapsed 120.73 0.12 122.52 > proc.time() - ptm user system elapsed 117.86 0.12 118.95 只是编译函数,但看起来单独使用enableJIT有一个不错的改进。所以扔掉

enableJIT

library(compiler)

位于代码顶部。

可以找到更多信息here

答案 3 :(得分:1)

library(compiler)
enableJIT(3)

nobs1 <- 4000
nobs2 <- 4000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

trixie<-matrix(nrow=length(mylon1),ncol=4)
trixie[,1]<-mylon1
trixie[,2]<-mylat1
trixie[,3]<-mylon2
trixie[,4]<-mylat2

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d)
}
calc_distance <- cmpfun(thedistance)
ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
distances<-apply(trixie,1, function(x) {
  return(apply(trixie,1, function(y) {
    return(calc_distance(x[1],x[2],y[3],y[4]))
  }))
})

proc.time() - ptm

答案 4 :(得分:1)

这里有两个解决方案,一个是部分矢量化的,使用sapply,我认为另一个是完全矢量化的,但是我已经为部分矢量计算写了几个函数,所以可能有更好的方法。

我也只为nobs1 = 4, 40 & 400nobs2 = 5, 50 & 500运行了这些内容,因为我的笔记本电脑在使用large matrices时难以记忆。

解决方案 - sapply

R <- 6371 # Earth mean radius [km]

delta.lon <- sapply(mylon2, "-", mylon1)
delta.lon <- sin(delta.lon/2)^2

coslat2 <- cos(mylat2)
coslat1 <- cos(mylat1)

delta.lan <- sapply(mylat2, "-", mylat1)
delta.lan <- sin(delta.lan/2)^2

a <- delta.lan + t(sapply(coslat1, "*", coslat2) * t(delta.lon))
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d

f <- c(t(e))

并检查输出

sum(f)
sum(mydistance)

all.equal(f, mydistance)


> sum(f)
[1] 647328856
> sum(mydistance)
[1] 647328856
> all.equal(f, mydistance)
[1] TRUE

解决方案 - 功能

R <- 6371 # Earth mean radius [km]

#function to calculate the difference between each
#element of different sized vectors and return a matrix
vecEleDif <- function(vec1, vec2)
{
    #the order of arguments is vec2 - vec1
    len1 <- length(vec1);len2 <- length(vec2)
    dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
    dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
    res <- t(dummy2 - t(dummy1))
}

#Function to calculate the product of each 
#element of two different size vectors
vecEleProd <- function(vec1, vec2)
{
    #the order of the arguments is vec2 * vec1
    len1 <- length(vec1); len2 <- length(vec2)
    dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
    dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
    res <- t(dummy2 * t(dummy1))
}

ptm <- proc.time()

delta.lon <- sin(vecEleDif(mylon2, mylon1)/2)^2
delta.lan <- sin(vecEleDif(mylat2, mylat1)/2)^2
cosprod <- vecEleProd(cos(mylat1), cos(mylat2))

a <- delta.lan + (t(cosprod) * delta.lon)
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d
f <- c((e))

proc.time() - ptm

并检查输出:

sum(f)
sum(mydistance)

all.equal(f, mydistance)

> sum(f)
[1] 647745044
> sum(mydistance)
[1] 647745044
> 
> all.equal(f, mydistance)
[1] TRUE