我有3亿个点,需要与6000万个多边形做相交运算。这两个数据集加在一起,超出了我的机器内存能够轻松容纳的大小。我想到的一个解决方案是:将两个数据集分别加载到PostGIS中,为每个数据集建立空间索引,然后执行空间联接。
在PostGIS中,如下所示:
-- Tag every point with the ID of the grid polygon that contains it and
-- materialize the result as the new table test_join.
SELECT pts.*, grid.gridID
INTO test_join
FROM pts
JOIN grid
  ON ST_Contains(grid.geometry, pts.geometry);
在 `pts`(3亿个点)上建立空间索引大约需要90分钟;随后执行上述联接大约需要190分钟。
我以前从未用 R 处理过大于 RAM 容量的数据。有没有办法使用 R 中的 `sf` 包来处理这种规模的数据?我更喜欢使用开源工具(R、PostGIS、Python 等),但并不拘泥于任何特定的工具链。
补充说明:我没有给出具体的实现方案,这似乎引起了一些困惑。我最初没有提供任何语法或示例,是因为我并不拘泥于特定的平台,任何开源技术栈我都愿意使用。正如标题所说,我在此重申:这里的问题在于规模,而不是解决一个简单示例的语法。
下面是一个非常具体的示例,使用 R 中的 `sf` 包实现。该示例在美国范围内使用边长 500 km 的方形网格和 1000 个随机点。我想将其扩展到边长小于 1 km 的网格和 300,000,000 个点。我并不关心绘图,下面的图仅用于说明。
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`
# Edge length of a grid square, in projection units (metres for the CRS below).
grid_size <- 500000
# How many random points take part in the join.
num_pts <- 1000

# Bring the `us_states` shape into the workspace.
data(us_states)

# One buffered outline of the lower 48 states: reproject to a metre-based
# projection, merge all states into a single geometry (no state borders),
# then pad the result with a 10 km buffer.
all_states <- st_buffer(
  st_combine(st_transform(us_states, 102003)),
  10000
)

# Quick visual check of the buffered outline.
ggplot() + geom_sf(data = all_states)
## Lay a grid over the whole buffered outline.
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin

# Number of cells needed along x and along y to cover the bounding box.
cell_dim <- ceiling(c(xrange, yrange) / grid_size)

# Build the grid, keep only the part of each cell that lies inside the
# outline, and give every remaining cell a sequential id.
full_us_grid <- all_states %>%
  st_make_grid(n = cell_dim, square = TRUE) %>%
  st_intersection(all_states) %>%
  st_sf() %>%
  mutate(grid_id = 1:n())

ggplot() + geom_sf(data = full_us_grid)
## Generate the random points to join against the grid.
# Coordinates are drawn in degrees, so the points are first declared in
# EPSG:4326 and only then reprojected into the metric CRS used by the grid.
random_pts <- st_transform(
  st_as_sf(
    data.frame(
      point_id = 1:num_pts,
      lat = runif(num_pts, 30, 50),
      lon = runif(num_pts, -117, -78)
    ),
    coords = c("lon", "lat"),
    crs = 4326
  ),
  102003
)

# Grid with the points drawn on top.
ggplot() +
  geom_sf(data = full_us_grid) +
  geom_sf(data = random_pts)
## The spatial join itself: attach to every grid cell the points it matches.
joined_data <- st_join(full_us_grid, random_pts)

## Mapping from grid_id to point_id: drop the geometry column, discard the
## rows that contain NA, and show the first few rows.
head(na.omit(st_set_geometry(joined_data, NULL)))
#> grid_id point_id
#> 7 7 26
#> 7.1 7 322
#> 7.2 7 516
#> 7.3 7 561
#> 7.4 7 641
#> 7.5 7 680
由reprex package(v0.2.1)于2018-12-24创建
答案 0 :(得分:2)
在这种情况下(查找哪些点位于矩形单元格内),
您可以同时提高速度并降低内存需求:
使用软件包 `SearchTrees` 中的函数 `createTree`
在点集上构建 QuadTree(四叉树),
然后使用其 `rectLookup` 函数查找落在每个单元格内的点。
这样,您既可以节省内存(无需建立多边形网格),又可以提高速度,
因为 QuadTree 构建完成之后,`rectLookup` 的速度非常快:
它大大减少了需要进行的坐标比较的次数。
例如:
library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)
# Bring the `us_states` shape into the workspace.
data(us_states)

# One buffered outline of the lower 48 states: reproject to a metre-based
# projection, merge all states into a single geometry (no state borders),
# then pad the result with a 10 km buffer.
all_states <- st_buffer(
  st_combine(st_transform(us_states, 102003)),
  10000
)
# Define the grid without building a polygon grid (which is memory intensive
# for small cells): take the bounding box, compute how many cells are needed
# along each axis, and give every cell a unique id plus its row/column
# position in the grid.
grid_size <- 500000
state_box <- st_bbox(all_states)

# `[[ ]]` extracts plain unnamed numbers from the bbox, the same way
# find_points() reads it.
xrange <- state_box[["xmax"]] - state_box[["xmin"]]
yrange <- state_box[["ymax"]] - state_box[["ymin"]]

# Number of cells along x (columns) and along y (rows).
cell_dim <- ceiling(c(xrange, yrange) / grid_size)
n_cells <- cell_dim[1] * cell_dim[2]

# Cells are numbered row by row: ids 1..cell_dim[1] form row 1, and so on.
# seq_len() keeps this correct even if n_cells were 0 (1:0 would yield c(1, 0)).
ind_rows <- ceiling(seq_len(n_cells) / cell_dim[1])
ind_cols <- seq_len(n_cells) - (ind_rows - 1) * cell_dim[1]

cell_indexes <- data.frame(
  grid_id = seq_len(n_cells),
  ind_row = ind_rows,
  ind_col = ind_cols,
  stringsAsFactors = FALSE
)
## Generate the random points. They are drawn directly in the 102003
## projection for speed, because st_transform() does not scale very well with
## the number of points. If your points are in 4326, consider transforming
## them beforehand and storing the result (RData, gpkg or shapefile).
## No `sf` object is created, to save memory: a plain id/x/y data.table is
## enough.
num_pts <- 3000
set.seed(1234)
t1 <- Sys.time()
random_pts <- data.table::data.table(
  point_id = 1:num_pts,
  lon = runif(num_pts, state_box$xmin, state_box$xmax),
  lat = runif(num_pts, state_box$ymin, state_box$ymax)
)

# Index the points with a quadtree; columns 2 and 3 hold the coordinates
# (lon, lat).
qtree <- SearchTrees::createTree(random_pts, columns = c(2, 3))
#' Find the points that fall inside one grid cell and inside `all_states`.
#'
#' Uses `SearchTrees::rectLookup()` on a prebuilt quadtree to fetch candidate
#' points for the cell, and handles the corner cases: cells outside
#' `all_states` and cells only partially within it.
#'
#' @param cell Row index into `cell_indexes` of the cell to process.
#' @param qtree Quadtree built with `SearchTrees::createTree()` over the
#'   coordinate columns of `random_pts`.
#' @param random_pts Table of points with columns `point_id`, `lon` and `lat`;
#'   `rectLookup()` results are used as row indices into this table.
#' @param state_box Bounding box whose `xmin`/`ymin` are the grid origin.
#' @param all_states Polygon geometry that the points must lie within.
#' @param grid_size Edge length of a grid cell, in the units of `state_box`.
#' @param cell_indexes Data frame with columns `grid_id`, `ind_row`, `ind_col`.
#' @return A `data.table` with columns `grid_id` and `point_id`: one row per
#'   point found, or a single row with `point_id = NA` when the cell holds no
#'   points.
find_points <- function(cell, qtree, random_pts, state_box, all_states,
                        grid_size, cell_indexes) {
  # Cell extent, derived from the grid origin and the cell's column/row.
  cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
  cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
  cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
  cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])
  cell_poly <- sf::st_as_sfc(
    sf::st_bbox(
      c(xmin = cur_xmin, xmax = cur_xmax, ymin = cur_ymin, ymax = cur_ymax),
      crs = sf::st_crs(all_states)
    )
  )

  # `hits` always holds ROW INDICES into `random_pts`; the conversion to
  # point_id happens exactly once, at the end. (Previously the border branch
  # converted to point_id first and the result was then used as a row index
  # again, which is only correct when point_id equals the row number.)
  hits <- integer(0)

  # Look for points only if the cell intersects the all_states polygon.
  if (lengths(sf::st_intersects(cell_poly, all_states)) != 0) {
    if (lengths(sf::st_contains(all_states, cell_poly)) != 0) {
      # Cell completely contained: the rectangle lookup is the final answer.
      hits <- SearchTrees::rectLookup(
        qtree,
        xlims = c(cur_xmin, cur_xmax),
        ylims = c(cur_ymin, cur_ymax)
      )
    } else {
      # Border cell: restrict the lookup to the bbox of the intersection to
      # speed things up, then keep only the points properly contained in
      # all_states.
      clip_box <- sf::st_bbox(sf::st_intersection(all_states, cell_poly))
      hits <- SearchTrees::rectLookup(
        qtree,
        xlims = c(clip_box[["xmin"]], clip_box[["xmax"]]),
        ylims = c(clip_box[["ymin"]], clip_box[["ymax"]])
      )
      # Only "few" candidates should remain, so an exact sf predicate is
      # affordable. Skip it when the lookup returned nothing.
      if (length(hits) > 0) {
        inside <- sf::st_contains(
          all_states,
          sf::st_as_sf(
            random_pts[hits, ],
            coords = c("lon", "lat"),
            crs = sf::st_crs(all_states)
          )
        )[[1]]
        hits <- hits[inside]
      }
    }
  }

  point_ids <- if (length(hits) == 0) {
    NA_real_
  } else {
    random_pts[["point_id"]][hits]
  }

  data.table::data.table(
    grid_id = cell_indexes$grid_id[cell],
    point_id = point_ids
  )
}
让我们看看它是否有效
# Run find_points() once per grid cell, then stack the per-cell tables into a
# single data.table. seq_len() is used so that an empty grid yields an empty
# result instead of iterating over c(1, 0).
out <- lapply(
  seq_len(n_cells),
  FUN = function(x) {
    find_points(x, qtree, random_pts, state_box, all_states, grid_size,
                cell_indexes)
  }
)
out <- data.table::rbindlist(out)
out
#> grid_id point_id
#> 1: 1 NA
#> 2: 2 NA
#> 3: 3 NA
#> 4: 4 325
#> 5: 4 1715
#> ---
#> 1841: 59 1058
#> 1842: 60 899
#> 1843: 60 2044
#> 1844: 60 556
#> 1845: 60 2420
# Polygon version of the grid, built for plotting only. The cell size comes
# from `grid_size` and the ids from the actual number of cells, instead of the
# hard-coded 500000 and 1:60, so the plot stays valid if the grid changes.
# NOTE(review): assumes st_make_grid() orders cells row by row from the
# lower-left corner, matching the grid_id numbering in `cell_indexes`.
grd <- sf::st_make_grid(all_states, cellsize = grid_size) %>%
  sf::st_sf() %>%
  dplyr::mutate(grid_id = seq_len(dplyr::n()))
# Pick two cells to inspect visually.
id_sub <- c(5, 23)

# Keep only real matches: cells without points carry an NA point_id, which
# would give NA coordinates after the join and cannot be turned into
# geometries.
sub_pts <- out[grid_id %in% id_sub & !is.na(point_id)]

# Re-attach the coordinates. The join key is spelled out so the join does not
# depend on (or print a message about) guessed common columns.
sub_pts <- dplyr::left_join(sub_pts, random_pts, by = "point_id") %>%
  sf::st_as_sf(coords = c("lon", "lat"), crs = sf::st_crs(all_states))
# Draw the labelled grid, the buffered outline, and the points found for the
# selected cells.
ggplot2::ggplot(data = grd) +
  ggplot2::geom_sf(data = grd, fill = "transparent") +
  ggplot2::geom_sf_text(ggplot2::aes(label = grid_id)) +
  ggplot2::geom_sf(data = all_states, fill = "transparent") +
  ggplot2::geom_sf(data = sub_pts)
根据我的(有限)经验,这种方法随点数/单元格数的增加应该可以很好地扩展,并且内存占用相当低。此外,它很容易并行化(只要您有足够的内存)。
如果仍然无法在完整数据集上运行它(我无法在我的笔记本电脑上测试),
您还可以把点分成“块”来分批处理
(例如,将它们保存为 shp / gpkg,然后使用 `query` 参数每次只读取其中一部分点;
或者另存为按 lon 排序的表,每次读取前 XX 行)。
如果您按经度/纬度进行过滤,还可以进一步加快速度,
因为那样也会自动减少需要分析的单元格数量,从而节省大量时间。
答案 1 :(得分:-1)