Question

我有一个聚合操作，用于统计落在R中各个多边形内的点的数量。该操作目前以plyr::ddply()作为主要函数，需要按两个变量进行分组：dayofweek和hour。它运行得非常慢，所以我想用一个更快的函数来替换它，例如data.table包中的函数。

Reprex

创建数据框

该操作的主要目的是：取点数据框 df，并使用sf包中的st_intersects方法，统计与grid.sf中每个多边形相交的点的数量。

创建DF对象

library(sf)
library(tidyverse)
library(plyr)

# Example points: 100 rows, each with a day-of-week code, an hour (0-23) and a
# randomly drawn lat/lon.
# The day labels use the same two-letter codes ('SU', 'MO', ...) as the grid
# built by buildBaseGrid(); with the previous "Sun"/"Mon"/... labels the
# `dayofweek %in% x$Var2` filter in myf() matched nothing and every count was 0.
df <- data.frame(
  X = seq(1, 100, 1),
  dayofweek = rep(c("SU", "MO", "TU", "WE", "TH"), 20),
  hour = sample(seq(0, 23, 1), 100, replace = TRUE),
  lat = sample(seq(37.1234, 37.2345, 0.001), 100, replace = TRUE),
  lon = sample(seq(-122.5432, -122.4111, 0.001), 100, replace = TRUE)
)


# PROJ string for the coordinate reference system of the points
# (longitude/latitude on the WGS84 datum).
projcrs <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

# Turn the plain data frame into an sf point layer; the lon/lat columns are
# consumed to build the geometry column.
df <- st_as_sf(df, coords = c("lon", "lat"), crs = projcrs)

创建grid.sf对象

#' Build the polygon grid, expanded to one row per cell x day x hour
#'
#' Lays a regular grid over `x` and crosses every grid cell with seven
#' day-of-week codes and the 24 hours of the day, then attaches the cell
#' centroid, its coordinates and a text key.
#'
#' @param x Object handed to `sf::st_make_grid()` (in this script, the sf
#'   point layer `df`).
#' @return An sf object with columns `Var1` (cell polygon, the active
#'   geometry), `Var2` (day code), `Var3` (hour), `id` (row number), `center`
#'   (cell centroid), `lonn`/`latt` (centroid coordinates) and `pkey`
#'   (`"lonn;latt;Var2;Var3"`).
buildBaseGrid <- function(x) {
  # Tessellate the extent of x into square cells. The original comment
  # describes cellsize = 0.005 as a roughly 500 m grid.
  g <- sf::st_make_grid(x, cellsize = 0.005)

  # Optional visual check of the grid against the points:
  # ggplot() +
  #   geom_sf(data = x, size = 3) +
  #   geom_sf(data = g, alpha = 0)

  # Cross every cell with every day code and hour. expand.grid() names the
  # resulting columns Var1 (cell), Var2 (day) and Var3 (hour).
  days <- c("SU", "MO", "TU", "WE", "TH", "FR", "SA")
  hours <- as.character(0:23)
  grid.sf <- expand.grid(g, days, hours)
  grid.sf$id <- seq_len(nrow(grid.sf))

  # Centroid of each cell and its coordinates, computed once and reused.
  centers <- sf::st_centroid(grid.sf$Var1)
  center_xy <- sf::st_coordinates(centers)

  grid.sf <- grid.sf %>%
    dplyr::mutate(
      center = centers,
      lonn = center_xy[, 1],
      latt = center_xy[, 2],
      # Primary key: centroid position plus day and hour.
      pkey = paste0(lonn, ";", latt, ";", Var2, ";", Var3)
    )

  st_as_sf(grid.sf)
}

# Now build the grid.sf object from the points in df. buildBaseGrid() returns
# one row per grid cell x day-of-week code x hour.
grid.sf <- buildBaseGrid(df)

plyr :: ddply当前操作

#' Count the points that fall inside each grid cell of one day/hour group
#'
#' Written to be used as the `.fun` of `plyr::ddply()`, which calls it once
#' per (Var2, Var3) group of the grid.
#'
#' @param x One group of the grid. Must contain the columns `Var1` (cell
#'   polygons), `Var2` (day code) and `Var3` (hour).
#' @param pts sf point layer with `dayofweek` and `hour` columns. Defaults to
#'   the global `df`, which the function previously read implicitly.
#' @return `x` as a plain data frame with one extra column, `count`: the
#'   number of matching points that intersect each cell.
myf <- function(x, pts = df) {
  x <- as.data.frame(x)

  # Keep only the points whose day and hour occur in this group.
  pts <- dplyr::filter(pts, dayofweek %in% x$Var2, hour %in% x$Var3)

  # st_intersects() lists, per polygon, the indices of the intersecting
  # points; lengths() turns that into a per-polygon count.
  x$count <- lengths(sf::st_intersects(x$Var1, pts))

  # Return x itself. The previous `x %>% data.frame(x)` evaluated to
  # data.frame(x, x) and duplicated every column in the result.
  x
}

# Do the operation: run myf() once per (Var2, Var3) group, i.e. per day code
# and hour, and row-bind the per-group results into a single data frame.
output <- plyr::ddply(
  grid.sf,
  .(Var2, Var3),
  .fun = myf,
  .parallel = FALSE
) %>%
  as.data.frame()

这在我的机器上大约需要4秒钟，但在实际应用中，我必须将该过程执行数百次。

我尝试改用data.table，但发现很难转换。下面是我在data.table尝试中仅有的（无法运行的）代码：

# NOTE(review): does not run as written. myf() is called without its required
# argument `x`, data.table is never loaded, and grid.sf is an sf data frame
# that was never converted to a data.table or given a key, so `key(grid.sf)`
# has nothing to group by.
test4 <- grid.sf[, .(test = myf()), by = key(grid.sf)]

因此，如果有办法把这个ddply调用转换为data.table操作，我将非常高兴，因为data.table要快得多。

谢谢！

如何用data.table替换plyr :: ddply

Reprex

创建数据框

plyr :: ddply当前操作

0 个答案: