Is there a way to scrape all of the coordinates from here?
I know I'll have to do something like this:
library(rvest)
library(stringi)
url <- "http://www.imo.org/en/OurWork/Environment/PollutionPrevention/AirPollution/Pages/Emission-Control-Areas-%28ECAs%29-designated-under-regulation-13-of-MARPOL-Annex-VI-%28NOx-emission-control%29.aspx"
page <- read_html(url)  # read_html() has replaced the deprecated html()
coords <- page %>% html_nodes(".") %>% html_text()  # <- what do I put in html_nodes()?
But I don't know how to figure out what to put inside html_nodes().
I've been trying to use Firebug to work it out, but it's a mess (I have no experience with web scraping or with Firebug).
Answer 0 (score: 1)
The tables on the page are poorly structured: instead of one row per data point, there is a row for the headers and a single row holding all of the data, with each data point separated out as its own paragraph. The following code should turn this data into a list of data frames.
library(rvest)
# Set URL
url <- "http://www.imo.org/en/OurWork/Environment/PollutionPrevention/AirPollution/Pages/Emission-Control-Areas-%28ECAs%29-designated-under-regulation-13-of-MARPOL-Annex-VI-%28NOx-emission-control%29.aspx"
# Get webpage
webpage <- read_html(url)
# Create empty list to hold dataframes
ldf <- list()
# Get list of tables
tables <- webpage %>%
html_nodes("table.ms-rteTable-default")
# Loop through tables
for(t in 1:length(tables)){
# Get table data
table.data <- tables[[t]] %>%
html_nodes("td")
# Extract points from table.data
points <- table.data[4] %>%
html_nodes("p") %>%
html_text()
# Extract latitudes from table.data
lats <- table.data[5] %>%
html_nodes("p") %>%
html_text() %>%
sub(pattern = "″", replacement = "\"")
# Extract longitudes from table.data
lons <- table.data[6] %>%
html_nodes("p") %>%
html_text() %>%
sub(pattern = "″", replacement = "\"")
# Add dataframe to the list
ldf[[t]] <- data.frame(Point = points, Latitude = lats, Longitude = lons, stringsAsFactors = FALSE)
}
# Print list of dataframes
ldf
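If a single data frame is handier than a list, the pieces can be stacked afterwards. A minimal base-R sketch (the Table column is just an illustrative way of keeping track of which table each row came from):
# Stack the per-table data frames, tagging each row with its source table number
all.points <- do.call(rbind, lapply(seq_along(ldf), function(t) {
  cbind(Table = t, ldf[[t]])
}))
head(all.points)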
Answer 1 (score: 1)
A slightly different approach:
library(sp)
library(rvest)
library(stringi)
library(hrbrthemes)
library(tidyverse)
target_url <- "http://www.imo.org/en/OurWork/Environment/PollutionPrevention/AirPollution/Pages/Emission-Control-Areas-%28ECAs%29-designated-under-regulation-13-of-MARPOL-Annex-VI-%28NOx-emission-control%29.aspx"
pg <- read_html(target_url)
Now that we have the page, we need to grab the right elements, but the coordinates are stored in a format that is awkward to work with, so we'll convert them with a helper function:
dms_to_dec <- function(x) {
html_text(x) %>%
stri_replace_first_regex("º ", "d") %>%
stri_replace_first_regex("′ ", "'") %>%
stri_replace_first_regex("″", "") %>%
stri_replace_all_regex("[ \\.]", "") %>%
char2dms() %>%
as.numeric.DMS()
}
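To see what the sp helpers at the end of that pipeline do, here is a quick illustrative check (the input string is made up and only loosely mimics the page's format):
library(sp)
# 32 degrees 30 minutes North -> 32.5 decimal degrees
as.numeric.DMS(char2dms("32d30'N", chd = "d", chm = "'"))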
Now, we target each table but pull out the individual data elements, which are (crazily) stored as <p> elements wrapped inside each <td>. We pull them out and then build a data frame, using the table number as a grouping column:
html_nodes(pg, "table.ms-rteTable-default") %>%
map_df(~{
data_frame(
point = html_nodes(.x, xpath=".//td[1]/p") %>% xml_double(),
latitude = html_nodes(.x, xpath=".//td[2]/p") %>% dms_to_dec(),
longitude = html_nodes(.x, xpath=".//td[3]/p") %>% dms_to_dec()
)
}, .id = "table_num") -> regions
group_by(regions, table_num) %>%
summarise(n_points = n())
## # A tibble: 8 x 2
## table_num n_points
## <chr> <int>
## 1 1 47
## 2 2 206
## 3 3 45
## 4 4 55
## 5 5 47
## 6 6 206
## 7 7 45
## 8 8 55
Let's take a look:
ggplot(regions, aes(longitude, latitude, group=table_num)) +
geom_path(aes(color=table_num)) +
ggthemes::scale_color_tableau() +
coord_map("polyconic") +
theme_ipsum_rc(grid="XY")
And, for a better "look":
library(rgdal)
usa <- readOGR("http://eric.clst.org/wupl/Stuff/gz_2010_us_outline_500k.json")
usa_map <- fortify(subset(usa, R_STATEFP != "02" & L_STATEFP != "02"))
ggplot() +
geom_map(data=usa_map, map=usa_map, aes(x=long, y=lat, map_id=id), color="#2b2b2b", size=0.15, fill="white") +
geom_path(data=regions, aes(x=longitude, y=latitude, group=table_num, color=table_num)) +
ggthemes::scale_color_tableau() +
coord_map(xlim=c(-180, -47)) +
theme_ipsum_rc(grid="XY")
That looks right, too:
Answer 2 (score: 0)
html_nodes("table.ms-rteTable-default") %>% html_table()
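Expanded into something runnable (a minimal sketch; the selector is the same one used in the answers above, and whether the resulting data frames come out in a usable shape depends on the awkward table structure described in the first answer):
library(rvest)
url <- "http://www.imo.org/en/OurWork/Environment/PollutionPrevention/AirPollution/Pages/Emission-Control-Areas-%28ECAs%29-designated-under-regulation-13-of-MARPOL-Annex-VI-%28NOx-emission-control%29.aspx"
# html_table() returns one data frame per matched table
tables <- read_html(url) %>%
  html_nodes("table.ms-rteTable-default") %>%
  html_table()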