给定一个数据表(A),其坐标标识节点质心的位置,另一个数据表(B),其坐标标识采样样本环境数据的位置,如何确定给定节点B中的位置在A中最接近,不使用for循环?目前,我可以做到以下几点:
示例
#Locations of nodes of interest
node_latitude = seq(33, 36.6, by=0.4)
node_longitude = seq(-53, -56.6, by=-0.4)
node_longitude_mat = matrix(NA, 10, 10)
node_coordinates = vector("list")
for(i in 1:length(node_latitude)) {
node_longitude_mat[,i] = rep(node_longitude[i],nrow(node_longitude_mat))
node_coordinates[[i]] = cbind(node_latitude, node_longitude_mat[,i])
}
node_locations = as.data.frame(do.call("rbind", node_coordinates),
stringsAsFactors = FALSE)
colnames(node_locations) = c("LATITUDE", "LONGITUDE")
plot(node_locations)
#Sampling locations providing environmental data, e.g., temperature
#Locations of nodes of interest
sample_values_01 = runif(100, -10, 20)
sample_values_02 = runif(100, -10, 20)
sample_values_03 = runif(100, -10, 20)
samp_latitude = 30:39
samp_longitude = -50:-59
samp_longitude.mat = matrix(NA, 10, 10)
samp_coordinates = vector("list")
for(i in 1:length(samp_latitude)) {
samp_longitude.mat[,i] = rep(samp_longitude[i],nrow(samp_longitude.mat))
samp_coordinates[[i]] = cbind(samp_latitude, samp_longitude.mat[,i])
}
samp_locations = as.data.frame(do.call("rbind", samp_coordinates),
stringsAsFactors = FALSE)
samp_locations = cbind(samp_locations, sample_values_01, sample_values_02,
sample_values_03)
colnames(samp_locations) = c("LATITUDE", "LONGITUDE", "TEMPERATURE_01",
"TEMPERATURE_02", "TEMPERATURE_03")
plot(samp_locations$LATITUDE~samp_locations$LONGITUDE)
points(node_locations$LATITUDE~node_locations$LONGITUDE, pch = 2)
require(geosphere)
test = vector()
value_coordinates = vector("list")
for(i in 1:length(node_locations$LONGITUDE)) {
for(j in 1:length(samp_locations$LONGITUDE)) {
test[j] = distHaversine(p1=c(node_locations$LONGITUDE[i],
node_locations$LATITUDE[i]),
p2=c(samp_locations$LONGITUDE[j],
samp_locations$LATITUDE[j]))
}
value_coordinates[[i]] = c(node_locations[i,],
samp_locations[which.min(test),3:5])
}
table_test = as.data.frame(do.call("rbind", value_coordinates),
stringsAsFactors = FALSE)
colnames(table_test) = colnames(samp_locations)
require(data.table)
table_test = data.table(table_test)
哪个适用于10x10表。这会很麻烦,包含数百或数千行和列。
答案 0 :(得分:0)
这是一个解决方案,通过单独使用outer()
经度来避免循环。
delta_LAT <- outer(node_locations$LATITUDE, samp_locations$LATITUDE,`-`)
delta_LON <- outer(node_locations$LONGITUDE, samp_locations$LONGITUDE,`-`)
distance <- (delta_LAT^2) + (delta_LAT^2)
# index of the nearest node to each sample
apply(distance,2,which.min)
你可以将它实现为一个像这样的单线:
function(x1,y1,x2,y2)
apply(outer(x2, x1[indx],`-`)^2 +
outer(y2, y1[indx],`-`)^2,2,which.min))
这有一个主要的复杂因素,即outer()
可能会尝试分配比可用内存更多的空间。您可以通过将作业分解为块以响应内存限制来处理此问题,如下所示:
f <- function(x1,y1,x2,y2,chunkSize=length(x1)){
stopifnot(length(x1) == length(y1))
stopifnot(length(x2) == length(y2))
force(chunkSize)
incomplete = TRUE
# define an inner function here so we can use
# scoping assigment (`<<-`) to extend the variable
# `out` from within the call to `tryCatch()`
inner <- function(i,chunkSize){
indx <- seq.int(chunkSize * (i-1) + 1,
min(chunkSize*i,length(x1)))
out <<- c(out,
apply(outer(x2, x1[indx],`-`)^2 +
outer(y2, y1[indx],`-`)^2,2,which.min))
}
while(incomplete){
out <- integer(0)
cat('chunk size:',chunkSize,'\n')
tryCatch( {
for(i in seq.int(ceiling(length(x1)/chunkSize)))
inner(i,chunkSize)
incomplete=FALSE
},error=function(e){
if(!grepl("^cannot allocate vector of size",e$message))
# re-raise any error other than memory limit errors
browser()#;stop(e)
print('too big')
if(chunkSize > 1)
chunkSize <<- floor(chunkSize/2)
else
stop('cannot allocate memory for even one comparison')
})
}
return(out)
}
每张表有1000×10,000行,这项工作需要不到一秒钟。
x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(10000)
y2 <- runif(10000)
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2)})
#> user system elapsed
#> 0.72 0.11 0.87
每个表1000行123,456行,该作业大约需要13秒
x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(123456)
y2 <- runif(123456)
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2)})
#> user system elapsed
#> 9.41 3.10 12.61
并且您可以通过预先指定chunkSize
来节省大约一秒钟x1 <- runif(1000)
y1 <- runif(1000)
x2 <- runif(123456)
y2 <- runif(123456)
system.time({a = f(x1=x1, x2=x2, y1=y1, y2=y2,chunkSize=250)})
#> user system elapsed
#> 8.69 2.59 11.61
这当然使用经度和纬度上的欧几里德度量,如果您的采样点靠近极点,可能会有些不准确,因此您可能希望使用不同的度量标准,具体取决于您的数据位置... < / p>