我正在尝试使用rvest从USGS帖子表格下载数据。我做错了什么?
make_url = function(base_url, parameter_list)
parameter_list %>%
names %>%
paste(parameter_list, sep = "=", collapse = "&") %>%
paste(base_url, ., sep = "")
session =
list(sn = "01170000") %>%
make_url("http://ida.water.usgs.gov/ida/available_records.cfm?", .) %>%
html_session
test =
session %>%
html_form %>%
.[[1]] %>%
set_values(fromdate = "1990-10-01") %>%
set_values(todate = "2007-09-30") %>%
set_values(rtype = "3") %>%
submit_form(session, .)
答案 0 :(得分:2)
无需rvest
或会话。以下函数将接收工作站和日期,并返回一个数据框,其中包含USGS每次下载时吐出的数据文件注释。
它使用“下载压缩文件”选项来节省带宽并加快下载速度。它使临时文件读取数据,但在自身后清理。列将转换为正确的类型(如果您愿意,可以省略代码的这一部分)。如果您不需要,也可以省略附加注释(它似乎对我有用的信息)。
readr::read_lines()
用于提高速度,如果您不想依赖readLines()
包,则可以使用readr
。
转换为tibble
版data.frame
主要是为了更好的打印,但它有其他潜在的优势,所以如果你不想依赖{ {1}}包。
有99秒的硬编码超时,但如果需要,您可以对其进行参数化。
tibble
证明它有效:
library(httr)
library(readr)
library(tibble)
#' Retrieve IDA Station Data
#'
#' @param site_no site id
#' @param date_from records from date YYYY-mm-dd
#' @param date_to records to date YYYY-mm-dd
#' @return a parsed, type-converted data frame with a comments attribute.
#' @example
#' deerfield <- get_ida("01170000", "1990-10-01", "2007-09-30")
#'
#' head(deerfield)
#'
#' cat(comment(deerfield))
get_ida <- function(site_no, date_from, date_to) {
date_from_time <- sprintf("%s 00:15:00.0", date_from)
date_to_time <- sprintf("%s 23:45:00.0", date_to)
ida_referer <- sprintf("http://ida.water.usgs.gov/ida/available_records.cfm?sn=%s", site_no)
tf <- tempfile(".zip")
res <- POST(url = "http://ida.water.usgs.gov/ida/available_records_process.cfm",
body = list(fromdate = date_from,
todate = date_to,
mindatetime = date_from_time,
maxdatetime = date_to_time,
site_no = site_no,
rtype = "2",
submit1 = "Retrieve+Data"),
add_headers(Origin="http://ida.water.usgs.gov",
Referer=ida_referer),
write_disk(tf),
timeout(99),
encode = "form")
fils <- unzip(tf, exdir=tempdir())
tmp <- read_lines(fils)
unlink(tf)
unlink(fils)
comments <- grep("^#", tmp, value=TRUE)
records <- grep("^#", tmp, value=TRUE, invert=TRUE)
header <- records[1:2]
records <- records[-(1:2)]
cols <- strsplit(header[1], "[[:space:]]+")[[1]]
comments <- paste0(comments, collapse="\n")
records <- paste0(records, collapse="\n")
df <- read_tsv(records, col_names=cols, "cccnnnnc")
df$date_time <- as.POSIXct(df$date_time, format="%Y%m%d%H%M%S")
df <- as_tibble(df)
comment(df) <- comments
df
}
答案 1 :(得分:0)
好的,这是一种让rvest工作的方法:
library(magrittr)
make_url = function(base_url, parameter_list = list(), ...) {
together_list =
parameter_list %>%
c(list(...) )
together_list %>%
names %>%
paste(together_list, sep = "=", collapse = "&") %>%
paste(base_url, ., sep = "?")
}
download_ida = function(site_no,
fromdate = "1990-10-01",
todate = "2007-09-30",
dir = ".",
filename = paste(site_no, "txt", sep = ".") ) {
session =
"http://ida.water.usgs.gov/ida/available_records.cfm" %>%
make_url(sn = "01170000") %>%
html_session
form =
session %>%
html_form %>%
.[[1]] %>%
set_values(fromdate = fromdate,
todate = todate,
rtype = "2")
tempfile = tempfile(".zip")
submit_form(session, form, submit = NULL,
httr::write_disk(tempfile,
overwrite = TRUE),
httr::add_headers(Referer = session$url) )
filename = file.path(dir, filename)
tempfile %>%
unzip(exdir = dir) %>%
file.rename(filename)
filename
}
read_ida = function(filename) {
col_names =
filename %>%
readr::read_tsv(comment = "#", n_max = 1, col_names = FALSE)
filename %>%
readr::read_tsv(comment = "#", skip= 2, col_names = FALSE, na = "Ice",
col_types = cols(X2 = col_datetime(format = "%Y%m%d%H%M%S"))) %>%
stats::setNames(col_names)
}
deerfield =
"01170000" %>%
download_ida %>%
read_ida
但是有一点需要注意:rvest目前有一个开放式拉取请求https://github.com/hadley/rvest/pull/161,这需要让它发挥作用。为此,有必要重新定义submit_request和submit_form,以整合新的拉取请求:
submit_request = function(form, submit = NULL) {
is_submit <- function(x)
if ( is.null(x$type) ) FALSE else
tolower(x$type) %in% c("submit", "image", "button")
submits <- Filter(is_submit, form$fields)
if (length(submits) == 0) {
stop("Could not find possible submission target.", call. = FALSE)
}
if (is.null(submit)) {
submit <- names(submits)[[1]]
message("Submitting with '", submit, "'")
}
if (!(submit %in% names(submits))) {
stop("Unknown submission name '", submit, "'.\n", "Possible values: ",
paste0(names(submits), collapse = ", "), call. = FALSE)
}
other_submits <- setdiff(names(submits), submit)
method <- form$method
if (!(method %in% c("POST", "GET"))) {
warning("Invalid method (", method, "), defaulting to GET",
call. = FALSE)
method <- "GET"
}
url <- form$url
fields <- form$fields
fields <- Filter(function(x) length(x$value) > 0, fields)
fields <- fields[setdiff(names(fields), other_submits)]
values <- pluck(fields, "value")
names(values) <- names(fields)
list(method = method, encode = form$enctype, url = url, values = values)
}
submit_form = function(session, form, submit = NULL, ...) {
request <- submit_request(form, submit)
url <- xml2::url_absolute(form$url, session$url)
if (request$method == "GET") {
rvest:::request_GET(session, url = url, query = request$values, ...)
} else if (request$method == "POST") {
rvest:::request_POST(session, url = url, body = request$values,
encode = request$encode, ...)
} else {
stop("Unknown method: ", request$method, call. = FALSE)
}
}
希望拉动请求很快就会合并。