已有很好的教程介绍如何将休斯顿犯罪数据绘制到地图上,但没有简单的示例说明如何清理HPD(休斯顿警察局)提供的原始数据。 https://github.com/hadley/ggplot2/wiki/Crime-in-Downtown-Houston,-Texas-:-Combining-ggplot2-and-Google-Maps
# Sample of the raw crime records: block range, street name and report date.
# The integer row names are kept exactly as they appear in the original dump.
d <- data.frame(
  BlockRange = c(
    "5400-5499", "3700-3799", "2200-2299", "1000-1099", "1200-1299",
    "UNK", "1900-1999", "500-599", "1200-1299"
  ),
  StreetName = rep("BELL", 9L),
  Date = c(
    "4/28/2015", "4/11/2015", "4/26/2015", "4/9/2015", "4/9/2015",
    "4/21/2015", "4/26/2015", "4/26/2015", "4/17/2015"
  ),
  row.names = c(60L, 75L, 88L, 4972L, 4990L, 5096L, 5098L, 5099L, 5155L),
  stringsAsFactors = FALSE
)
这将返回Lat和Lon(纬度在前,经度在后):
# Geocode a single address; gGeoCode returns c(latitude, longitude).
x <- gGeoCode("1950 Bell St, Houston, TX")
#[1] 29.74800 -95.35926
但是,还需要一个函数来对整个数据集进行地理编码,并添加Lon和Lat两列。
完成后数据的示例:
# Example of the desired result: one row per address with its longitude (lon)
# and latitude (lat). The value is not assigned to a name here.
structure(list(address = c("9650 marlive ln", "4750 telephone rd",
"5050 wickview ln", "1050 ashland st", "8350 canyon", "9350 rowan ln",
"2550 southmore blvd", "6350 rupley cir", "5050 georgi ln", "10750 briar forest dr"
), lon = c(-95.4373883, -95.2988769, -95.455864, -95.4033373,
-95.3779081, -95.5483009, -95.3733977, -95.3156032, -95.4665841,
-95.565934), lat = c(29.6779015, 29.6917121, 29.5992174, 29.7902425,
29.6706341, 29.7022336, 29.7198936, 29.6902746, 29.8297359, 29.747596
)), row.names = 82729:82738, class = "data.frame", .Names = c("address",
"lon", "lat"))
以下是用于地理编码的函数:
library(RCurl)
library(RJSONIO)
library(dplyr)
library(gdata)
#' Build the Google geocoding request URL for one or more addresses.
#'
#' @param address Character vector of free-form addresses.
#' @param return.call Response format segment appended to the endpoint
#'   (`"json"` by default).
#' @param sensor Value sent as the `sensor` query parameter.
#' @return Character vector of request URLs, one per address.
construct.geocode.url <- function(address, return.call = "json", sensor = "false") {
  root <- "http://maps.google.com/maps/api/geocode/"
  # NOTE(review): the current Geocoding API is documented as requiring HTTPS
  # and an API key; the endpoint is left unchanged here -- confirm before use.

  # Percent-encode every query value on its own, reserved characters included.
  # Encoding the assembled URL instead leaves "&", "#" and "+" untouched, so an
  # address such as "MAIN & 5TH" would be cut off at the ampersand.
  encode_value <- function(value) {
    vapply(as.character(value), URLencode, character(1),
           reserved = TRUE, USE.NAMES = FALSE)
  }
  paste0(
    root, return.call,
    "?address=", encode_value(address),
    "&sensor=", encode_value(sensor)
  )
}
#' Geocode a single address.
#'
#' Builds the request URL with `construct.geocode.url()`, downloads it with
#' `getURL()` and parses the reply with `fromJSON()`.
#'
#' @param address A single address string.
#' @param verbose If `TRUE`, print the address before requesting it.
#' @return Numeric vector `c(latitude, longitude)` taken from the first result.
#'   `c(NA, NA)` (numeric) when the request or parsing fails, when the reply
#'   status is not `"OK"`, or when the reply holds no results.
gGeoCode <- function(address, verbose = FALSE) {
  if (verbose) cat(address, "\n")
  # Typed NA so successful and failed lookups combine into a numeric result.
  no_match <- c(NA_real_, NA_real_)

  u <- construct.geocode.url(address)
  # A transport or parse error should cost one address, not the whole batch:
  # report it as a warning and fall through to the NA result.
  x <- tryCatch(
    fromJSON(getURL(u), simplify = FALSE),
    error = function(e) {
      warning("Geocoding failed for '", address, "': ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )

  # isTRUE() also covers a missing status, which would otherwise make `if`
  # fail on a zero-length condition.
  if (!is.list(x) || !isTRUE(x$status == "OK") || length(x$results) == 0L) {
    return(no_match)
  }

  location <- x$results[[1]]$geometry$location
  c(location$lat, location$lng)
}
我们如何使用dplyr或另一种方法编写一个函数,用[address,long,lat]的输出再添加另外3列?
即..
# Pseudocode only (not valid R): sketch of the desired address/Lon/lat columns.
data.frame <- mutate(d, address = ConvertBlockRange(BlockRange) + StreetName, "Houston, TX"), Lon = geocode(address)[0] , lat = geocode(address)[1])
这是问题的阻碍点:
#function to convert - "2200-2299" to integer 2250.. i.e find the middle of the block.
library(stringr)
#' Convert block ranges such as "2200-2299" to the middle of the block (2250).
#'
#' @param blockRange Character vector of ranges written as "min-max".
#' @return Numeric vector the same length as `blockRange`: the mean of the two
#'   bounds plus 0.5. Entries that are not of the form "digits-digits" (for
#'   example "UNK" or `NA`) yield `NA`.
ConvertBlockRange <- function(blockRange) {
  # Work on the argument itself rather than a global data frame, so the
  # function can be used on any column (for instance inside mutate()).
  ranges <- trimws(as.character(blockRange))
  is_range <- grepl("^[0-9]+-[0-9]+$", ranges)

  midpoints <- rep(NA_real_, length(ranges))
  bounds <- strsplit(ranges[is_range], "-", fixed = TRUE)
  midpoints[is_range] <- vapply(
    bounds,
    function(b) mean(as.numeric(b)) + 0.5,
    numeric(1)
  )
  midpoints
}
答案 0 :(得分:2)
您可以将范围按“-”拆分后取平均值,来计算街区范围的中点:
例如:
# Midpoint of one block range: split on the dash and average both ends.
x <- "5400-5499"
mean(as.numeric(unlist(strsplit(x, "-")))) # 5449.5
要扩展到整个数据集,我们可以使用`tidyr`包中的`separate`。它会做一些很方便的事情,比如自动将blockrange的最小值/最大值放入新列,并将类型从字符串转换为数字(通过`convert`参数)。我先用`filter`过滤掉“UNK”地址 - 这些地址你必须单独处理。
library(dplyr)
library(tidyr)
# Turn each block range into a street address at the middle of the block.
d %>%
  # "UNK" has no numeric bounds; those rows have to be handled separately.
  filter(BlockRange != "UNK") %>%
  # Split "min-max" into blockmin and blockmax. convert = TRUE parses the
  # pieces as numbers; remove = FALSE keeps the original BlockRange column.
  separate(BlockRange, c("blockmin", "blockmax"), sep = "-",
           convert = TRUE, remove = FALSE) %>%
  # Midpoint of the block (rounded down), pasted in front of the street name.
  mutate(block = floor((blockmin + blockmax) / 2),
         address = paste(block, StreetName))
# BlockRange blockmin blockmax StreetName Date block address
# 1 5400-5499 5400 5499 BELL 4/28/2015 5449 5449 BELL
# 2 3700-3799 3700 3799 BELL 4/11/2015 3749 3749 BELL
# 3 2200-2299 2200 2299 BELL 4/26/2015 2249 2249 BELL
# 4 1000-1099 1000 1099 BELL 4/9/2015 1049 1049 BELL
# 5 1200-1299 1200 1299 BELL 4/9/2015 1249 1249 BELL
# 6 1900-1999 1900 1999 BELL 4/26/2015 1949 1949 BELL
# 7 500-599 500 599 BELL 4/26/2015 549 549 BELL
# 8 1200-1299 1200 1299 BELL 4/17/2015 1249 1249 BELL
然后您可以用`%>% group_by(address)`获取唯一地址并进行地理编码(不过我会考虑如何限制请求的最大数量等)。
至于一次同时添加lat和lon两列输出,我认为dplyr做不到(参见this feature request)。
如果你确实想在这里使用`dplyr`语法,最好的办法是修改`gGeoCode`使其矢量化,例如:
#' Geocode several addresses and return the coordinates as a data frame.
#'
#' @param addresses Either a data frame or list whose first element holds the
#'   addresses (as produced by `select(address)`), or a plain vector of
#'   addresses.
#' @return Data frame with numeric columns `lat` and `lng`, one row per
#'   address in input order; zero rows for empty input.
gGeoCode2 <- function(addresses) {
  # Piped input arrives as a one-column data frame; a bare vector is used
  # whole instead of being cut down to its first element.
  address_vec <- if (is.list(addresses)) addresses[[1]] else addresses
  address_vec <- as.character(address_vec)

  # vapply() always yields a 2 x n numeric matrix, even for n = 0, where
  # sapply() would return an empty list that t() cannot handle.
  coords <- vapply(address_vec, gGeoCode, numeric(2), USE.NAMES = FALSE)

  x <- data.frame(t(coords), row.names = NULL)
  names(x) <- c("lat", "lng")
  x
}
# Geocode the address column of d2 and append the resulting coordinate
# columns to d2.
bind_cols(d2, gGeoCode2(select(d2, address)))
但我真的认为,在这一步你应该放弃`dplyr`的语法糖,改用手动循环并用`cbind`合并结果,这样可以更好地控制请求限制。