我正在尝试用网络爬虫抓取这个网页(webpage)。我运行了下面的代码,想检查能否获取按年份和州划分的数据集。我能够抓取到州名称(重复了三次),但无法获得这些观测值对应的年份。另外,“值”(value)列的含义我也不明白。
我的目标是:针对所有下拉菜单中的每一个唯一取值,提取与之对应的表格。
感谢您的帮助。 我已经尝试了一段时间,但没有成功。
library(RSelenium)
library(rvest)
library(tidyverse)
library(RSelenium)
# Start a local Selenium server together with a Chrome session.
# rsDriver() returns a list; its "client" element is the remoteDriver used
# for every later call. The client is handed back already opened, so no
# separate remoteDriver() construction or remDr$open() call is needed
# (calling open() again would start a second browser session).
driver <- rsDriver(browser = "chrome")
remDr <- driver[["client"]]

# Load the page that hosts the year / state dropdowns.
remDr$navigate("http://agcensus.dacnet.nic.in/districtsummarytype.aspx")
# Read the visible label of every <option> inside the year dropdown and
# store the labels in a one-column tibble (`Year`).
years <- xml2::read_html(remDr$getPageSource()[[1]]) %>%
  rvest::html_nodes("#_ctl0_ContentPlaceHolder1_ddlYear") %>%
  rvest::html_children() %>%
  rvest::html_text() %>%
  tibble::tibble(Year = .)

# Add each option's 1-based position and a CSS selector pointing at it.
# row_number() follows however many options the page actually has instead
# of assuming exactly three; the selector carries the "#" id prefix so it
# addresses the same element as the html_nodes() call above.
years <- years %>%
  dplyr::mutate(
    list_position = dplyr::row_number(),
    x = stringr::str_c(
      "#_ctl0_ContentPlaceHolder1_ddlYear > option:nth-child(",
      list_position,
      ")"
    )
  )
# For every year label in the dropdown, select that year and record the
# states listed in the state dropdown. Slot i of `lst` belongs to v1[i];
# a slot is left as NULL when the state dropdown holds nothing but the
# placeholder entry.
v1 <- years$Year
lst <- vector("list", length(v1))
for (i in seq_along(lst)) {
  # Send the year label as keystrokes to the year dropdown; this is how the
  # script picks a year.
  # NOTE(review): if the page refreshes the state list after the year
  # changes, a wait may be needed before reading it -- confirm on the site.
  remDr$findElement(
    using = "id",
    value = "_ctl0_ContentPlaceHolder1_ddlYear"
  )$sendKeysToElement(list(v1[i]))

  elem <- remDr$findElement(
    using = "id",
    value = "_ctl0_ContentPlaceHolder1_ddlState"
  )
  elemtxt <- elem$getElementAttribute("outerHTML")[[1]]

  # Parse the <select> markup with xml2/rvest. Reading only the "value"
  # attribute keeps `key` and `value` the same length even when an option
  # carries extra attributes.
  state_options <- rvest::html_nodes(xml2::read_html(elemtxt), "option")
  key <- rvest::html_text(state_options)
  value <- rvest::html_attr(state_options, "value")

  # Skip a dropdown that contains only the placeholder entry. The slot is
  # already NULL; assigning NULL here would delete it and shift the list.
  if (length(value) == 1 && "--Select Crop--" %in% value) {
    next
  }

  # Keep the selected year next to every state row so the combined result
  # shows which year each observation came from.
  lst[[i]] <- data.frame(
    Year = rep(v1[i], length(key)),
    key = key,
    value = value,
    stringsAsFactors = FALSE
  )
}

# Stack the per-year tables; NULL slots are dropped.
res <- dplyr::bind_rows(lst)
head(res)
# Columns of `res`:
#   Year  - the year label that was selected in the year dropdown
#   key   - the state name shown in the state dropdown, e.g. "A & N ISLANDS"
#   value - the "value" attribute of that <option>, e.g. "23a"; it is the
#           option's code in the page markup, not a measurement