想要将每个邮政编码分配到最近的ghcnd气象采集站。使用library(zipcode)
和NOAA提供的ghcnd电台列表(ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/)
尝试使用dplyr
;使用rowwise %>% mutate()
。所有行都从查找表中分配了相同的值。
#small selection of zip codes from library(zipcode)
zip_samp <- "zip latitude longitude
30002 33.77212 -84.26491
30003 33.96035 -84.03786
30004 34.11918 -84.30292
30005 34.08004 -84.21929
39885 31.71000 -84.34000
39886 31.73000 -84.60000
39897 30.90000 -84.32000
39901 33.89125 -84.07456
"
zip <- read.table(text=zip_samp, header=TRUE)
#two example stations
station_samp <- "id lat long
US1GADK0015 33.7794 -84.2572
US1GAGW0024 33.8885 -84.0998
"
stations <- read.table(text=station_samp, header=TRUE)
通过硬编码说明所需的输出:
as.character(stations[which.min(distGeo(c(-84.26491, 33.77212), select(stations, long, lat))), "id"])
[1] "US1GADK0015"
as.character(stations[which.min(distGeo(c(-84.07456, 33.89125), select(stations, long, lat))), "id"])
[1] "US1GAGW0024"
请注意,这两个邮政编码分配给不同的工作站ID,但是当使用dplyr
按行方式应用相同的公式时,所有邮政编码都会分配给一个ID。
assigned <- zip %>%
select(longitude, latitude) %>%
rowwise() %>%
mutate(station =
as.character(stations[which.min(distGeo(., select(stations, long, lat))), "id"])
)
print(assigned)
Source: local data frame [8 x 3]
Groups: <by row>
# A tibble: 8 x 3
longitude latitude station
<dbl> <dbl> <chr>
1 -84.26491 33.77212 US1GADK0015
2 -84.03786 33.96035 US1GADK0015
3 -84.30292 34.11918 US1GADK0015
4 -84.21929 34.08004 US1GADK0015
5 -84.34000 31.71000 US1GADK0015
6 -84.60000 31.73000 US1GADK0015
7 -84.32000 30.90000 US1GADK0015
8 -84.07456 33.89125 US1GADK0015
是否有替代mutate()
来电内的索引?
注意,尽量避免一次创建整个距离矩阵。我希望计算rowwise将需要更少的资源
此外,在SO上有几个类似的问题,但是没有使用dplyr
。希望弄清楚为什么rowwise%&gt;%mutate的这种应用不会产生预期的结果。
答案 0 :(得分:2)
看看这是否适合你(数据的眼球说它确实如此):
library(tidyverse)
library(microbenchmark)
library(zipcode)
stat_df <- read_fwf(
"ghcnd-stations.txt",
fwf_widths(widths=c(11, 1, 8, 1, 9, 1, 6, 1, 2, 1, 30, 1, 3, 1, 3, 1, 5))
)
stations <- select(stat_df, station_id = X1, latitude = X3, longitude = X5)
closest_station <- function(lat, lon) {
index <- which.min(sqrt((stations$latitude-lat)^2 + (stations$longitude-lon)^2)) # less precise but likely good enough
stations[index,]$station_id
}
data(zipcode)
zipcode <- tbl_df(zipcode)
zipcode
set.seed(1492)
smpl <- zipcode[sample(nrow(zipcode), 100),]
mutate(smpl, station_id = map2_chr(latitude, longitude, closest_station))
## # A tibble: 100 x 6
## zip city state latitude longitude station_id
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 28137 Richfield NC 35.49326 -80.25524 US1NCSN0006
## 2 22027 Dunn Loring VA 38.89392 -77.21976 US1VAFX0064
## 3 19080 Wayne PA 40.04320 -75.35768 US1PADL0005
## 4 12459 New Kingston NY 42.22799 -74.68912 USC00305743
## 5 06082 Enfield CT 41.98724 -72.56365 US1CTHR0005
## 6 01302 Greenfield MA 42.52218 -72.62416 USC00193295
## 7 83540 Lapwai ID 46.39708 -116.78649 USC00105132
## 8 49266 Osseo MI 41.84489 -84.55244 USC00203823
## 9 37871 Strawberry Plains TN 36.04051 -83.67934 USC00408677
## 10 75042 Garland TX 32.91562 -96.67399 US1TXDA0065
## # ... with 90 more rows
我认为这个数学是正确的,但必须喷射......
microbenchmark(mutate(smpl, station_id = map2_chr(latitude, longitude, closest_station)))
# !300ms for 100
# ((nrow(zipcode)/100) * 300) / 1000 / 60 == ~3m