I'm new to web scraping and I want to get the data from this webpage: http://www.neotroptree.info/data/countrysearch
At that link there are four form fields ("Country", "Domain", "State" and "Site").
I have a data frame of site names, which I built with the following code:
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}

ipak(c("rgdal", "tidyverse"))
#> Loading required package: rgdal
#> Loading required package: sp
#> rgdal: version: 1.3-4, (SVN revision 766)
#> Geospatial Data Abstraction Library extensions to R successfully loaded
#> Loaded GDAL runtime: GDAL 2.2.2, released 2017/09/15
#> Path to GDAL shared files: /usr/share/gdal/2.2
#> GDAL binary built with GEOS: TRUE
#> Loaded PROJ.4 runtime: Rel. 4.9.2, 08 September 2015, [PJ_VERSION: 492]
#> Path to PROJ.4 shared files: (autodetected)
#> Linking to sp version: 1.3-1
#> Loading required package: tidyverse
#> rgdal tidyverse
#> TRUE TRUE
download.file(url = "http://www.neotroptree.info/files/Neotropicos.kmz",
              destfile = "neotroptree-site.kmz",
              quiet = FALSE)
rgdal::ogrListLayers("neotroptree-site.kmz")
#> [1] "Neotropicos"
#> [2] "Jubones, Quito, Pichincha, Ecuador"
#> attr(,"driver")
#> [1] "LIBKML"
#> attr(,"nlayers")
#> [1] 2
ntt <- rgdal::readOGR("neotroptree-site.kmz", "Neotropicos")
#> OGR data source with driver: LIBKML
#> Source: "/tmp/Rtmppf54qE/neotroptree-site.kmz", layer: "Neotropicos"
#> with 7504 features
#> It has 11 fields
ntt.df <- data.frame(site = ntt@data$Name,
                     long = ntt@coords[, 1],
                     lat = ntt@coords[, 2]) %>%
  .[order(.$site), ] %>%
  mutate(., ID = rownames(.)) %>%
  mutate(., site = as.character(site))
ntt.df <- ntt.df[, c("ID", "site", "long", "lat")]
glimpse(ntt.df)
#> Observations: 7,504
#> Variables: 4
#> $ ID <chr> "2618", "2612", "3229", "2717", "2634", "4907", "3940", "...
#> $ site <chr> "Abadia, cerrado", "Abadia, floresta semidecidual", "Abad...
#> $ long <dbl> -43.15000, -43.10667, -48.72250, -45.52493, -45.27417, -4...
#> $ lat <dbl> -17.690000, -17.676944, -16.089167, -19.111667, -19.26638...
Manually, I would need to search for each site in the form and download its site details one at a time, which is not practical for roughly 7,500 sites.
My first attempt was with the rvest package, but it cannot find any form fields on the page:
if(!require("rvest")) install.packages("rvest")
#> Loading required package: rvest
#> Loading required package: xml2
url <- "http://www.neotroptree.info/data/countrysearch"
webpage <- html_session(url)
webpage %>%
  html_form()
#> list()
Any ideas on how to automate this process?
Answer 0 (score: 4)
RSelenium, decapitated and splashr all introduce third-party dependencies that can be difficult to set up and maintain. No browser automation is needed here, so RSelenium is unnecessary; decapitated won't help much, and splashr is overkill for this use case.
The form you see on the site is a proxy for a Solr database. Open your browser's developer tools at that URL, hit refresh, and watch the XHR entries in the Network tab: the page fires asynchronous requests on load and on every form interaction.
All we have to do is mimic those interactions. The source below is heavily commented; you may want to step through it manually to see what is going on under the hood.
We need a few helpers:
library(xml2)
library(curl)
library(httr)
library(rvest)
library(stringi)
library(tidyverse)
Most of those are loaded when you load rvest, but I like being explicit. Also, the stringr functions are needless crutch wrappers around the more explicit stringi functions, so we'll use the latter.
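For reference, the three stringi helpers used below behave like this (toy inputs, shown only to illustrate the call patterns):
stri_detect_regex("123", "^[[:digit:]]+$")                   # TRUE
stri_replace_first_regex("code_s:AtlMG104", "code_s:", "")   # "AtlMG104"
stri_replace_all_fixed("a,b,c", ",", "-")                    # "a-b-c"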
First, we get the list of sites. The function mimics the POST request you'll see if you take the advice above and use the developer tools to watch what's going on.
get_list_of_sites <- function() {

  # This is the POST request the site makes to get the metadata for the popups.
  # I used http://gitlab.com/hrbrmstr/curlconverter to untangle the monstrosity
  httr::POST(
    url = "http://www.neotroptree.info/data/sys/scripts/solrform/solrproxy.php",
    body = list(
      q = "*%3A*",
      host = "padme.rbge.org.uk",
      c = "neotroptree",
      template = "countries.tpl",
      datasetid = "",
      f = "facet.field%3Dcountry_s%26facet.field%3Dstate_s%26facet.field%3Ddomain_s%26facet.field%3Dsitename_s"
    ),
    encode = "form"
  ) -> res

  httr::stop_for_status(res)

  # extract the returned JSON from the HTML document it returns
  xdat <- jsonlite::fromJSON(html_text(content(res, encoding="UTF-8")))

  # only return the site list (the xdat structure has a lot more in it, though)
  discard(xdat$facets$sitename_s, stri_detect_regex, "^[[:digit:]]+$")

}
We call it below; it just returns a character vector of site names.
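The Solr facet list interleaves site names with their counts, which is why discard() drops the all-digit entries. A tiny illustration with made-up values (not the live response):
facet <- c("Abadia, cerrado", "1", "Abadia, floresta semidecídua", "1")
discard(facet, stri_detect_regex, "^[[:digit:]]+$")
## [1] "Abadia, cerrado"              "Abadia, floresta semidecídua"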
Now we need a function to get the site data returned in the lower portion of the form output. It does the same thing as above, except that it adds the ability to say which site to fetch and where to store the downloaded file. overwrite is handy to have since you may be doing a lot of downloading and end up trying to download the same file again. Because we use httr::write_disk() to save the file, setting this parameter to FALSE causes an exception and halts any loop/iteration you have going. You probably don't want that.
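To see why, here is a toy illustration against example.com (not part of the scraper); the second call fails because the path already exists:
httr::GET("http://example.com", httr::write_disk("/tmp/example.html", overwrite = TRUE))
httr::GET("http://example.com", httr::write_disk("/tmp/example.html", overwrite = FALSE))
## Error: Path exists and overwrite is FALSE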
get_site <- function(site, dl_path, overwrite=TRUE) {

  # this is the POST request the site makes as an XHR request so we just
  # mimic it with httr::POST. We pass in the site code in `q`
  httr::POST(
    url = "http://www.neotroptree.info/data/sys/scripts/solrform/solrproxy.php",
    body = list(
      q = sprintf('sitename_s:"%s"', curl::curl_escape(site)),
      host = "padme.rbge.org.uk",
      c = "neotroptree",
      template = "countries.tpl",
      datasetid = "",
      f = "facet.field%3Dcountry_s%26facet.field%3Dstate_s%26facet.field%3Ddomain_s%26facet.field%3Dsitename_s"
    ),
    encode = "form"
  ) -> res

  httr::stop_for_status(res)

  # it returns a JSON structure
  xdat <- httr::content(res, as="text", encoding="UTF-8")
  xdat <- jsonlite::fromJSON(xdat)

  # unfortunately the bit with the site-id is in HTML O_o
  # so we have to parse that bit out of the returned JSON
  site_meta <- xml2::read_html(xdat$docs)

  # now, extract the link code
  link <- html_attr(html_node(site_meta, "div.solrlink"), "data-linkparams")
  link <- stri_replace_first_regex(link, "code_s:", "")

  # Download the file and get the filename metadata back
  xret <- get_link(link, dl_path) # the code for this is below

  # add the site name
  xret$site <- site

  # return the list
  xret[c("code", "site", "path")]

}
I put the code that retrieves the files into its own function, since encapsulating that functionality separately seemed to make sense. YMMV. I also removed the ridiculous commas from the filenames.
get_link <- function(code, dl_path, overwrite=TRUE) {

  # The Download link looks like this:
  #
  #   <a href="http://www.neotroptree.info/projectfiles/downloadsitedetails.php?siteid=AtlMG104">
  #     Download site details.
  #   </a>
  #
  # So we can mimic that with httr
  site_tmpl <- "http://www.neotroptree.info/projectfiles/downloadsitedetails.php?siteid=%s"
  dl_url <- sprintf(site_tmpl, code)

  # The filename comes in a "Content-Disposition" header so we first
  # do a lightweight HEAD request to get the filename
  res <- httr::HEAD(dl_url)
  httr::stop_for_status(res)

  stri_replace_all_regex(
    res$headers["content-disposition"],
    '^attachment; filename="|"$', ""
  ) -> fil_name

  # commas in filenames are a bad idea rly
  fil_name <- stri_replace_all_fixed(fil_name, ",", "-")

  message("Saving ", code, " to ", file.path(dl_path, fil_name))

  # Then we use httr::write_disk() to do the saving in a full GET request
  res <- httr::GET(
    url = dl_url,
    httr::write_disk(
      path = file.path(dl_path, fil_name),
      overwrite = overwrite
    )
  )

  httr::stop_for_status(res)

  # return a list so we can make a data frame
  list(
    code = code,
    path = file.path(dl_path, fil_name)
  )

}
Now, we get the site list (as promised):
# get the site list
sites <- get_list_of_sites()
length(sites)
## [1] 7484
head(sites)
## [1] "Abadia, cerrado"
## [2] "Abadia, floresta semidecídua"
## [3] "Abadiânia, cerrado"
## [4] "Abaetetuba, Rio Urubueua, floresta inundável de maré"
## [5] "Abaeté, cerrado"
## [6] "Abaeté, floresta ripícola"
We'll grab one site's ZIP file:
# get one site link dl
get_site(sites[1], "/tmp")
## $code
## [1] "CerMG044"
##
## $site
## [1] "Abadia, cerrado"
##
## $path
## [1] "/tmp/neotroptree-CerMG04426-09-2018.zip"
Now, grab a couple more and get back a data frame of code, site and saved path:
# get a few (remove [1:2] to do them all but PLEASE ADD A Sys.sleep(5) into get_link() if you do!)
map_df(sites[1:2], get_site, dl_path = "/tmp")
## # A tibble: 2 x 3
## code site path
## <chr> <chr> <chr>
## 1 CerMG044 Abadia, cerrado /tmp/neotroptree-CerMG04426-09-20…
## 2 AtlMG104 Abadia, floresta semidecídua /tmp/neotroptree-AtlMG10426-09-20…
If you plan on doing a mass download, do heed the guidance about adding a Sys.sleep(5) to get_link(). CPU, memory and bandwidth aren't free, and the site likely has not scaled its server to handle roughly 8,000 back-to-back multi-request HTTP call sequences, each ending with a file download.
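If you would rather not edit get_link() itself, one way to throttle the full run is to wrap get_site() with a pause; a minimal sketch (slow_get_site and all_sites are names I made up):
slow_get_site <- function(site, dl_path) {
  Sys.sleep(5)  # pause between sites so the server isn't hammered
  get_site(site, dl_path)
}
all_sites <- map_df(sites, slow_get_site, dl_path = "/tmp")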
Answer 1 (score: 1)
@hrbrmstr following your guide I was able to do it with Python; I was struggling with Selenium. I also added a log file so it is easy to resume from the middle, plus a retry loop that downloads each file and tries again if an error occurs, which helps make sure every site ends up in the results, since the goal is of course to copy the whole database. Here is the code, in case anyone wants to see how it works in Python:
import requests
import os
from time import sleep
from subprocess import call

out_dir = '/home/rupestre/tree_scrap/data/'

def read_log(out_dir):
    # return the list of site names already downloaded (one per line in log.txt)
    log_path = os.path.join(out_dir, 'log.txt')
    site_names = []
    if os.path.isfile(log_path):
        with open(log_path) as log:
            for line in log:
                site_names.append(line.strip())
    return site_names

def save_log(site_name, out_dir):
    # append a finished site name to log.txt
    log_path = os.path.join(out_dir, 'log.txt')
    call('echo "{}\n" >> {}'.format(site_name, log_path), shell=True)

def download_url(url, save_path, chunk_size=128):
    # stream the file to disk in chunks
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

def get_sites():
    # same Solr proxy POST as in the R answer; returns the list of site names
    url = "http://www.neotroptree.info/data/sys/scripts/solrform/solrproxy.php"
    data = {
        'q': "*%3A*",
        'host': "padme.rbge.org.uk",
        'c': "neotroptree",
        'template': "countries.tpl",
        'datasetid': "",
        'f': "facet.field%3Dcountry_s%26facet.field%3Dstate_s%26facet.field%3Ddomain_s%26facet.field%3Dsitename_s"
    }
    x = requests.post(url, data, headers={'accept': 'application/json'})
    response = x.json()
    sites = response['facets']['sitename_s']
    # the facet list alternates site names and counts; drop the count entries
    sites = [x for x in sites if x != 1]
    return sites

def get_site_code(site):
    # query a single site and parse its code out of the HTML snippet inside the JSON
    data = {
        'q': 'sitename_s:"{}"'.format(site),
        'host': "padme.rbge.org.uk",
        'c': "neotroptree",
        'template': "countries.tpl",
        'datasetid': "",
        'f': "facet.field%3Dcountry_s%26facet.field%3Dstate_s%26facet.field%3Ddomain_s%26facet.field%3Dsitename_s"
    }
    url = "http://www.neotroptree.info/data/sys/scripts/solrform/solrproxy.php"
    x = requests.post(url, data, headers={'accept': 'application/json'})
    #print(x.status_code)
    response = x.json()
    site_data = response['docs']
    site_code = site_data.split("data-linkparams=")[1].split('"')[1].split('code_s:')[1]
    return site_code

def get_filename(site_code):
    # HEAD request: the filename comes back in the Content-Disposition header
    url_code = "http://www.neotroptree.info/projectfiles/downloadsitedetails.php?siteid={}".format(site_code)
    x = requests.head(url_code)
    filename = x.headers['Content-Disposition'].split('"')[-2]
    return filename, url_code

def main():
    log = read_log(out_dir)
    sites = get_sites()
    for site in sites:
        if site in log:
            continue
        download = False
        while not download:
            # retry until the site downloads without raising an exception
            try:
                print(site)
                site_code = get_site_code(site)
                filename, url_code = get_filename(site_code)
                local_file = os.path.join(out_dir, filename)
                if not os.path.isfile(local_file):
                    download_url(url_code, local_file)
                    save_log(site, out_dir)
                    download = True
                    print(filename)
                    sleep(5)
                else:
                    save_log(site, out_dir)
                    download = True
                    print(filename)
                    sleep(5)
            except Exception as e:
                print(e)
                sleep(5)

if __name__ == '__main__':
    main()