我正在尝试修复函数中嵌套for循环的瓶颈。我已经尝试了三个功能,似乎无法修复它。特别是如果它是data.table或rcpp解决方案的任何帮助,我们非常感谢。这是一个包含100个单元格的光栅的示例,但是我有超过1,000,000个单元格,因此速度至关重要。
考虑以下栅格:
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
这是一个小型栅格,只有一百个单元格。我做了以下功能来研究动物运动,在那里我使用光栅和动物可以在时间片中移动的最大距离。嵌套循环存在一个我无法解决的瓶颈。
我得到的回报是具有以下变量的数据框:
来自动物可以从中移动的栅格中的细胞ID
to 动物可以移动到的栅格中的单元格的ID
距离从 到 的距离
加载必要的库
library(gdistance)
library(dplyr)
library(tidyr)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
但我只摆脱了其中一个循环
DistConect2 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
# connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
#For each pair of cells in B
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
但使用microbenchmarck软件包测试差异显示没有任何区别:
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)
DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428 4
DistConect2(r, Distance = 1e+06) 9.892371 10.07783 10.35453 10.41144 10.63124 10.70288 4
CLD 一个 一个
我也尝试了并行化,但实际上需要更长的时间:
DistConect2b <- function (Raster, Distance, cpus = NULL)
{
h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
h16 <- geoCorrection(h16, scl = FALSE)
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
# cat(paste(i, "of", nrow(B)))
}
require(snowfall)
sfInit(parallel=TRUE, cpus=cpus)
sfLibrary(gdistance)
sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
sfStop(nostop=FALSE)
# sep.connections <- lapply(1:nrow(B), all.cells)
connections <- do.call("rbind", sep.connections)
connections <- as.data.frame(connections)
}
结果microbenchmark
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
DistConect2(r, Distance = 1e+06) 9.963549 9.974315 10.01547 10.01173 10.05662
DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809
在我得到了很好的答案之后,我试图更进一步,并添加了一个lapply来替换代码中的for循环:
DistConect4 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
temp <- accCost2(h16, B[i, ])
index <- which(temp < 1000000) # all.index <- function(j){
connections <- cbind(i, index, temp[index])
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
connections <- as.data.frame(do.call("rbind", connections))
#Get everything together as a large data frame
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
使用下面定义的acccost2函数
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
但是当我尝试
时timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)
print(timing, unit = "relative")
它没有使这个过程更快
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
DistConect3(r, Distance = 1e+06) 1.000000 1.00000 1.000000 1.000000 1.000000
DistConect4(r, Distance = 1e+06) 0.997329 1.00141 1.019697 1.002112 1.005626
我认为申请总是比任何想法更快,为什么这会让这个过程更快?
答案 0 :(得分:3)
你可以通过改变
摆脱内循环temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")
进入这个:
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
我也调查了accCost
,这似乎是这里最慢的功能。不幸的是,它调用了一些C代码来寻找最短的路径,这可能意味着没有什么可以优化的。我创建了accCost2
,我删除了一些代码,但我怀疑它有多重要。我也不确定这里的并行化效率如何,因为运行时间不长。下面是一些显示体面改善的基准。
library(gdistance)
library(dplyr)
library(tidyr)
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
# print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
DistConect3 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost2(h16,B[i,])
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)
# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE
timing1 <- microbenchmark(
DistConect1(r, Distance = 1000),
DistConect3(r, Distance = 1000),
times = 10
)
print(timing1, unit = "relative")
# expr min lq mean median uq max neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10 b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
timing2 <- microbenchmark(
DistConect1(r, Distance = 10000),
DistConect3(r, Distance = 10000),
times = 10
)
print(timing2, unit = "relative")
# expr min lq mean median uq max neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569 10 b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a