Question

我有3亿个点要与6000万个多边形相交。这两个的组合要比我可以轻松装入机器内存的大小大。我提出了一个解决方案，其中将每个数据集加载到PostGIS中，对每个数据集执行空间索引，然后执行空间联接。

在PostGIS中，如下所示：

SELECT pts.*, grid.gridID
into test_join
FROM pts, grid
WHERE ST_Contains( grid.geometry, pts.geometry);

pts上的空间索引（3亿个点）大约需要90分钟。然后，上述连接大约需要190分钟。

我以前从未用R处理过大于RAM空间的数据。

有没有办法使用sf中的R包来处理这种规模的数据
现有哪些策略可以加快操作速度？
我应该考虑其他工具或方法吗？

我更喜欢使用开源工具（R，PostGIS，Python等）。但是我不致力于任何特定的工具链。

其他数据 似乎我缺乏说明具体的解决方案引起了混乱。我最初没有提供任何语法或示例的原因是我没有加入特定的平台。我愿意使用任何开源堆栈。就像标题中所说的那样，我在文本中重申，这里的问题是规模，而不是解决一个琐碎例子的语法。

这是一个非常特殊的解决方案，使用R中的sf包解决了。以下示例适用于500 km平方和1000个随机点的美国网格。我想将其扩展到1公里以下的网格和300,000,000点。我根本不关心绘图，但下面仅作说明，仅作说明。

library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`

# size of squares in projection units (in this case meters)
grid_size <- 500000
num_pts <- 1000   # number of points to join

data(us_states) # loads the us_states shape

all_states <-
  us_states %>%
  # st_sf() %>%
  st_transform(102003) %>% # project to a meters based projection
  st_combine   %>% #flattens the shape file to one big outline (no states)
  st_buffer(10000) # add a 10k buffer

#a nice outter buffer of the usa lower 48
ggplot() +
  geom_sf(data = all_states)

## let's put a grid over the whole US
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin

cell_dim <-
  c(ceiling(xrange / grid_size),
    ceiling(yrange / grid_size)) # dimension of polygons necessary

full_us_grid <-
  st_make_grid(all_states, square = TRUE, n = cell_dim) %>%
  st_intersection(all_states) %>% # only the inside part
  st_sf() %>%
  mutate(grid_id = 1:n())

ggplot() +
  geom_sf(data = full_us_grid)

## now let's create some random points
random_pts <- data.frame(
  point_id = 1:num_pts,
  lat = runif(num_pts, 30, 50),
  lon = runif(num_pts, -117, -78)
) %>%
  # these are in degrees so need crs in same
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  st_transform(102003)  # transform into our metric crs

ggplot() +
  geom_sf(data = full_us_grid) +
  geom_sf(data = random_pts)

## here is the spatial join!!
joined_data <-
  full_us_grid %>%
  st_join(random_pts)

## this is the mapping from grid_id to point_id
joined_data %>%
  st_set_geometry(NULL) %>%
  na.omit()  %>%
  head
#>     grid_id point_id
#> 7         7       26
#> 7.1       7      322
#> 7.2       7      516
#> 7.3       7      561
#> 7.4       7      641
#> 7.5       7      680

^{由reprex package（v0.2.1）于2018-12-24创建}

Answer 1

在这种情况下（查找哪些点位于矩形单元格内）您可以通过以下方式同时提高速度和减少内存需求使用软件包createTree中的函数SearchTrees构建QuadTree并然后使用其rectLookup函数查找单元格。这样，您既可以节省内存（无需建立多边形网格），又可以增加内存自从构建QuadTreee rectLookup之后的速度非常快，因为它大大减少了要进行的坐标比较的次数。例如：

library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)

data(us_states) # loads the us_states shape
all_states <-
  us_states %>%
  # st_sf() %>%
  st_transform(102003) %>% # project to a meters based projection
  st_combine()   %>% #flattens the shape file to one big outline (no states)
  st_buffer(10000) # add a 10k buffer

# define the grid - no need to create a polygon grid, which is memory intensinve
# for small grids. Just get the bbox, compute number of cells and assign a unique
# index to each
#
grid_size <- 500000
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
  c(ceiling(xrange / grid_size),
    ceiling(yrange / grid_size))
n_cells <- cell_dim[1] * cell_dim[2]
ind_rows <- ceiling(1:n_cells / cell_dim[1])
ind_cols <- (1:n_cells) - (ind_rows - 1) * cell_dim[1]
cell_indexes <- data.frame(grid_id = 1:n_cells,
                           ind_row = ind_rows,
                           ind_col = ind_cols,
                           stringsAsFactors = FALSE)

## now let's create some random points - Here I build the points directly in 
## 102003 projection for speed reasons because st_transform() does not scale 
## very well with number of points. If your points are in 4326 you may consider 
## transforming them beforehand and store the results in a RData or gpkg or 
## shapefile. I also avoid creating a `sf` object to save memory: a plain x-y-id 
## data.table suffices
set.seed(1234)
t1 <- Sys.time()
num_pts <- 3000
random_pts <- data.table::data.table(
  point_id = 1:num_pts,
  lon = runif(num_pts, state_box$xmin, state_box$xmax),
  lat = runif(num_pts, state_box$ymin, state_box$ymax)
)

# Build a Quadtree over the points.
qtree <- SearchTrees::createTree(random_pts, columns = c(2,3))

# Define a function which uses `SearchTrees::rectLookup` to find points within 
# a given grid cell. Also deal with "corner cases": cells outside all_states and
# cells only partially within all_states.

find_points <- function(cell, qtree, random_pts, state_box, all_states, grid_size, cell_indexes) {

  cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
  cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
  cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
  cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])

  cur_bbox <- sf::st_bbox(c(xmin = cur_xmin, xmax = cur_xmax,
                            ymin = cur_ymin, ymax = cur_ymax),
                          crs = sf::st_crs(all_states)) %>%
    sf::st_as_sfc()

  # look for contained points only if the cell intersects with the all_states poly
  if (lengths(sf::st_intersects(cur_bbox, all_states)) != 0) {

    if (lengths(sf::st_contains(all_states, cur_bbox)) != 0) {
      # If cell completely contained, use `rectLookup` to find contained points
      pts <-  SearchTrees::rectLookup(
        qtree,
        xlims = c(cur_xmin, cur_xmax),
        ylims = c(cur_ymin, cur_ymax))


    } else {
      # If cell intersects, but is not completely contained (i.e., on borders),
      # limit the rectLookup to the bbox of intersection to speed-up, then find
      # points properly contained
      cur_bbox <- sf::st_bbox(sf::st_intersection(all_states, cur_bbox))
      pts <-  SearchTrees::rectLookup(
        qtree,
        xlims = c(cur_bbox[["xmin"]], cur_bbox[["xmax"]]),
        ylims = c(cur_bbox[["ymin"]], cur_bbox[["ymax"]]))
      # now we should have "few" points - we can use sf operators - here st_contains
      # is much faster than an intersect. This should be fast even over large
      # number of points if the cells are small
      contained_pts <- sf::st_contains(
        all_states,
        sf::st_as_sf(random_pts[pts,],
                     coords = c("lon", "lat"),
                     crs = sf::st_crs(all_states)))[[1]]
      pts  <- random_pts[pts[contained_pts],][["point_id"]]
    }
    if (length(pts) == 0 ) {
      pts <- as.numeric(NA)
    } else {
      pts  <- random_pts$point_id[pts]
    }
  } else {
    pts <- as.numeric(NA)
  }
  out <- data.table::data.table(
    grid_id  = cell_indexes$grid_id[cell],
    point_id = pts)
  return(out)
}

让我们看看它是否有效

# Run the function through a `lapply` over grid cells

out <- lapply(1:n_cells, FUN = function(x)
  find_points(x, qtree, random_pts, state_box, all_states, grid_size,cell_indexes))
out <- data.table::rbindlist(out)
out
#>       grid_id point_id
#>    1:       1       NA
#>    2:       2       NA
#>    3:       3       NA
#>    4:       4      325
#>    5:       4     1715
#>   ---                 
#> 1841:      59     1058
#> 1842:      60      899
#> 1843:      60     2044
#> 1844:      60      556
#> 1845:      60     2420

grd <- sf::st_make_grid(all_states, cellsize = 500000) %>% 
  sf::st_sf() %>% 
  dplyr::mutate(grid_id = 1:60)

id_sub = c(5, 23)
sub_pts <- out[grid_id %in% id_sub]
sub_pts <- dplyr::left_join(sub_pts, random_pts) %>%
  sf::st_as_sf(coords = c("lon", "lat"), crs = st_crs(all_states))
#> Joining, by = "point_id"

ggplot2::ggplot(data = grd) +
  geom_sf(data = grd, fill = "transparent") +
  geom_sf_text(aes(label = grid_id)) +
  geom_sf(data = all_states, fill = "transparent") +
  geom_sf(data = sub_pts)

根据我的（有限）经验，这应该可以很好地扩展点/单元并具有相当低的内存占用。此外，它很容易并行化（只要您有足够的内存）。

如果仍然无法在完整数据集上运行它（我无法测试在我的笔记本电脑上），您还可以通过分析其中的点来“拆分”执行 “块”（例如，将它们保存为shp / gpkg，然后仅读取使用query自变量，或另存为lon排序的表中的部分点并阅读前XX行-这可以使您进一步如果您对经度/纬度进行过滤，则可以加快速度，因为那样您还可以自动降低要分析的细胞数量，并节省大量时间。

Answer 2

尝试使用以下链接中所述的云解决方案：

https://blog.sicara.com/speedup-r-rstudio-parallel-cloud-performance-aws-96d25c1b13e2

比内存操作大：使用R进行空间连接

2 个答案: