我一直在尝试用 rvest 抓取一个包含表格的网站。目前我已经可以访问到该表格,但只能提取到表头,提取不到表格的内容。我还没有找到解决这个问题的办法,不过已经尝试过一些类似问题的解决方案。(How to get table using rvest())
我已经弄清楚了如何在提交按钮没有合适的唯一标识符的情况下提交表单。参见:How to submit login form in Rvest package w/o button argument,https://github.com/hadley/rvest/issues/156
这是我的操作脚本:
library(rvest)
library(httr)
library(R.utils)
# Page used both to open the initial session and to jump back to for the query.
url <- "https://itmdapps.milwaukee.gov/publicApplication_QD/zipcode.jsp"
##############################
# Patched submit_request() for rvest, taken from https://github.com/hadley/rvest/issues/156
#' Build the request description used to submit a parsed HTML form
#'
#' Splits the form's fields into submit and non-submit fields, picks exactly
#' one submit field according to `submit`, and returns the method, encoding,
#' URL and field values to send.
#'
#' @param form A parsed form: a list with elements `fields` (a named list of
#'   fields, each a list with `name`, `type` and `value`), `method`, `url`
#'   and `enctype`.
#' @param submit Which submit field to send:
#'   * `NULL` (default): the first submit field; its name is reported with
#'     `message()`.
#'   * a character name: the first submit field carrying that name.
#'   * a list: the first submit field is reused, but its name is replaced by
#'     the name of the list's first element and its value by that element.
#' @return A list with elements `method` ("POST" or "GET"), `encode`
#'   (`form$enctype`), `url` (`form$url`) and `values` (a named list of the
#'   values of every field whose value is non-empty).
custom.submit_request <- function(form, submit = NULL) {
  is_submit <- function(field) identical(tolower(field$type), "submit")
  submits <- Filter(is_submit, form$fields)
  nsubmits <- Filter(Negate(is_submit), form$fields)

  # Every selection mode below starts from an existing submit field, so fail
  # early with a readable message instead of "subscript out of bounds".
  if (length(submits) == 0L) {
    stop("Could not find any submit fields in the form", call. = FALSE)
  }

  if (is.list(submit)) {
    # If list, take its first name and value as the submit inputs.
    if (length(submit) == 0L) {
      stop("`submit` is an empty list; expected list(<name> = <value>)",
        call. = FALSE
      )
    }
    chosen <- submits[[1]]
    chosen$name <- names(submit)[1]
    chosen$value <- submit[[1]]
  } else if (is.character(submit)) {
    # If character, select by name. The comparison must return exactly one
    # logical per field: a field with a NULL name would otherwise yield
    # logical(0) and shift the indices that Filter() works with.
    has_name <- function(field) {
      length(field$name) == 1L && field$name %in% submit
    }
    matches <- Filter(has_name, submits)
    if (length(matches) == 0L) {
      stop("No submit field named '", paste(submit, collapse = "', '"), "'",
        call. = FALSE
      )
    }
    chosen <- matches[[1]]
  } else if (is.null(submit)) {
    # If NULL, choose the first submit field.
    chosen <- submits[[1]]
    message("Submitting with '", chosen$name, "'")
  } else {
    stop("`submit` must be NULL, a field name, or a named list",
      call. = FALSE
    )
  }

  # Handle method: anything other than a single POST/GET falls back to GET.
  method <- form$method
  if (length(method) != 1L || !(method %in% c("POST", "GET"))) {
    warning("Invalid method (", method, "), defaulting to GET",
      call. = FALSE
    )
    method <- "GET"
  }

  # Fields: all non-submit fields plus the single chosen submit field,
  # keeping only those that actually carry a value.
  fields <- nsubmits
  fields[chosen$name] <- list(chosen)
  fields <- Filter(function(field) length(field$value) > 0, fields)
  values <- lapply(fields, .subset2, "value")
  names(values) <- names(fields)

  list(
    method = method,
    encode = form$enctype,
    url = form$url,
    values = values
  )
}
# Replace rvest's submit_request with the patched custom.submit_request so
# that the form submissions below can pass `submit` as a named list.
reassignInPackage(
  name = "submit_request",
  pkgName = "rvest",
  value = custom.submit_request
)
#####################################
# Value sent in the `zipcode` field of the query form further down.
target_zip_code_position <- 2

# Open a session on the start page and parse every form found on it.
webpage.session <- html_session(url)
form <- html_form(webpage.session)

# Log in.
form # inspect the parsed forms; the second one takes the credentials

# Fill in a copy so the parsed original stays untouched.
filled_form <- form
filled_form[[2]] <- set_values(
  filled_form[[2]],
  username = "address",
  password = "user"
)
filled_form # inspect the filled-in copy

# Left disabled: clearing the form URL to prevent an error message on submit.
# filled_form[[2]]$url <- ""

# No `submit` argument: the first submit button is used and a message is shown.
logged_in.session <- submit_form(
  session = webpage.session,
  form = filled_form[[2]]
)
# The session is now logged in.
# Go back to the query page within the logged-in session and parse its forms.
zip_search.session <- jump_to(logged_in.session, url)
zip_search.form <- html_form(zip_search.session)

# Fill in the second form on a copy of the parsed forms.
zip_search.form_filled <- zip_search.form
zip_search.form_filled[[2]] <- set_values(
  zip_search.form_filled[[2]],
  zipcode = target_zip_code_position,
  format = 1,
  startDate = "01/01/2005",
  endDate = "01/01/2006"
)

# The submit control is given as a named list (name "submit", value
# "WIBR Detailed"); this relies on the patched submit_request.
list_submit <- list(submit = "WIBR Detailed")
output.session <- submit_form(
  session = zip_search.session,
  form = zip_search.form_filled[[2]],
  submit = list_submit
)
# The query has been sent; extract the results table from the response.
# Earlier attempts, kept for reference:
#   output.read_html <- read_html(output.session)
#   output_table <- html_table(output.session, fill = TRUE)[[1]]  # no rows
table_node <- html_node(
  output.session,
  "div.main div.content:nth-child(5) table.bordered:nth-child(1)"
)
html_text(table_node) # the selector hits the right table, but still no rows
html_table(table_node)