R提高代码性能

时间:2015-03-04 23:04:20

标签: r

给定一个数据表(A),其坐标标识节点质心的位置,另一个数据表(B),其坐标标识采样样本环境数据的位置,如何确定给定节点B中的位置在A中最接近,不使用for循环?目前,我可以做到以下几点:

示例

#Locations of nodes of interest
node_latitude = seq(33, 36.6, by=0.4)
node_longitude = seq(-53, -56.6, by=-0.4)
node_longitude_mat = matrix(NA, 10, 10)
node_coordinates = vector("list")
for(i in 1:length(node_latitude)) {
  node_longitude_mat[,i] = rep(node_longitude[i],nrow(node_longitude_mat))
  node_coordinates[[i]] = cbind(node_latitude, node_longitude_mat[,i])
}

node_locations = as.data.frame(do.call("rbind", node_coordinates), 
                              stringsAsFactors = FALSE)
colnames(node_locations) = c("LATITUDE", "LONGITUDE")
plot(node_locations)

#Sampling locations providing environmental data, e.g., temperature
#Locations of nodes of interest
sample_values_01 = runif(100, -10, 20)
sample_values_02 = runif(100, -10, 20)
sample_values_03 = runif(100, -10, 20)
samp_latitude = 30:39
samp_longitude = -50:-59
samp_longitude.mat = matrix(NA, 10, 10)
samp_coordinates = vector("list")
for(i in 1:length(samp_latitude)) {
  samp_longitude.mat[,i] = rep(samp_longitude[i],nrow(samp_longitude.mat))
  samp_coordinates[[i]] = cbind(samp_latitude, samp_longitude.mat[,i])
}

samp_locations = as.data.frame(do.call("rbind", samp_coordinates), 
                          stringsAsFactors = FALSE)
samp_locations = cbind(samp_locations, sample_values_01, sample_values_02, 
                       sample_values_03)
colnames(samp_locations) = c("LATITUDE", "LONGITUDE", "TEMPERATURE_01", 
                             "TEMPERATURE_02", "TEMPERATURE_03")
plot(samp_locations$LATITUDE~samp_locations$LONGITUDE)
points(node_locations$LATITUDE~node_locations$LONGITUDE, pch = 2)

require(geosphere)
test = vector()
value_coordinates = vector("list")
for(i in 1:length(node_locations$LONGITUDE)) {
  for(j in 1:length(samp_locations$LONGITUDE)) {
    test[j] = distHaversine(p1=c(node_locations$LONGITUDE[i], 
                              node_locations$LATITUDE[i]), 
                         p2=c(samp_locations$LONGITUDE[j],
                              samp_locations$LATITUDE[j]))
  }
  value_coordinates[[i]] = c(node_locations[i,], 
                             samp_locations[which.min(test),3:5])
}

table_test = as.data.frame(do.call("rbind", value_coordinates), 
                               stringsAsFactors = FALSE)
colnames(table_test) = colnames(samp_locations)
require(data.table)
table_test = data.table(table_test)

哪个适用于10x10表。这会很麻烦,包含数百或数千行和列。

1 个答案:

答案 0 :(得分:0)

这是一个解决方案,通过单独使用outer()经度来避免循环。

delta_LAT <- outer(node_locations$LATITUDE, samp_locations$LATITUDE,`-`)
delta_LON <- outer(node_locations$LONGITUDE, samp_locations$LONGITUDE,`-`)
distance <- (delta_LAT^2) + (delta_LAT^2)
# index of the nearest node to each sample
apply(distance,2,which.min)

你可以将它实现为一个像这样的单线:

function(x1,y1,x2,y2)
    apply(outer(x2, x1[indx],`-`)^2 +
          outer(y2, y1[indx],`-`)^2,2,which.min))

这有一个主要的复杂因素,即outer()可能会尝试分配比可用内存更多的空间。您可以通过将作业分解为块以响应内存限制来处理此问题,如下所示:

f <- function(x1,y1,x2,y2,chunkSize=length(x1)){
    stopifnot(length(x1) == length(y1))
    stopifnot(length(x2) == length(y2))
    force(chunkSize)
    incomplete = TRUE

    # define an inner function here so we can use 
    # scoping assigment (`<<-`) to extend the variable 
    # `out` from within the call to `tryCatch()` 
    inner <- function(i,chunkSize){
        indx <- seq.int(chunkSize * (i-1) + 1,
                        min(chunkSize*i,length(x1)))
        out <<- c(out,
                 apply(outer(x2, x1[indx],`-`)^2 +
                       outer(y2, y1[indx],`-`)^2,2,which.min))
    }
    while(incomplete){
        out <- integer(0)
        cat('chunk size:',chunkSize,'\n')
        tryCatch( {
            for(i in seq.int(ceiling(length(x1)/chunkSize)))
                inner(i,chunkSize)
            incomplete=FALSE
        },error=function(e){
            if(!grepl("^cannot allocate vector of size",e$message))
                # re-raise any error other than memory limit errors
                browser()#;stop(e)
            print('too big')
            if(chunkSize > 1)
                chunkSize <<- floor(chunkSize/2)
            else
                stop('cannot allocate memory for even one comparison')
        })
    }
    return(out)
}

每张表有1000×10,000行,这项工作需要不到一秒钟。

x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(10000)
y2 <- runif(10000)  
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2)})
#> user  system elapsed 
#> 0.72    0.11    0.87 

每个表1000行123,456行,该作业大约需要13秒

x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(123456)
y2 <- runif(123456) 
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2)})
#> user  system elapsed 
#> 9.41    3.10   12.61 

并且您可以通过预先指定chunkSize

来节省大约一秒钟
x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(123456)
y2 <- runif(123456) 
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2,chunkSize=250)})
#> user  system elapsed 
#> 8.69    2.59   11.61 

这当然使用经度和纬度上的欧几里德度量,如果您的采样点靠近极点,可能会有些不准确,因此您可能希望使用不同的度量标准,具体取决于您的数据位置... < / p>