我正在使用RSelenium从具有动态形式的[website] [1]中删除数据,其中多个下拉菜单根据所选内容而变化。我试图拉变量'Number&每个州每个地区的“运营控股区”。
我能够使代码正常工作,但是当分区没有表时(网站数据库有一些没有数据的区域)会出现问题。当我的代码进入没有数据的区域时,它就会完成,我留下了一个不完整的数据集。
我如何创建一个可以跳过缺少表的区域的代码?我的代码粘贴在下面。当我调整他们的代码时,一个特殊的喊叫就会转到前面的堆栈交换线程,[link here] [2]。此外,如果任何人都可以清理我的最终输出,以避免重复每个新区的变量标题,我们将不胜感激。
RM(列表= LS(所有= TRUE))
library(RSelenium)
library(XML)
library(dplyr)
library(magrittr)
library(devtools)
library(rvest)
# Start Selenium Server --------------------------------------------------------
checkForServer()
startServer()
remDrv <- remoteDriver()
remDrv$open()
# Simulate browser session and fill out form -----------------------------------
remDrv$navigate('http://agcensus.dacnet.nic.in/districtsummarytype.aspx')
# Select year
remDrv$findElement(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList2']/option[@value = '2010']")$clickElement()
# Select 1 == Number & Area of Operational Holdings
remDrv$findElement(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList3']/option[@value = '1']")$clickElement()
# Select 4 == All Social Group
remDrv$findElement(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList4']/option[@value = '4']")$clickElement()
# Select 3 == All Gender (Total)
remDrv$findElement(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList8']/option[@value = '3']")$clickElement()
# Get all state IDs and the respective names
state_IDs <- remDrv$findElements(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
state_names <- remDrv$findElements(using = "xpath",
"//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
# Retrieve and download results ------------------------------------------------
result <- data.frame(state = character(), district = character(),
V1 = character(), V2 = character(), V3 = character(),
V4 = character(), V5 = character(), V6 = character(),
V7 = character(), V8 = character(), V9 = character(),
V10 = character(), V11 = character(), V12 = character())
for (i in seq_along(state_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option[@value = ",
"'", state_IDs[i], "']"))$clickElement()
Sys.sleep(2)
# Get all district IDs and names from the currently selected states
district_IDs <- remDrv$findElements(using = "xpath",
"//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementAttribute('value')}) %>%
unlist
district_names <- remDrv$findElements(using = "xpath",
"//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
lapply(function(x){x$getElementText()}) %>%
unlist
for (j in seq_along(district_IDs)) {
remDrv$findElement(using = "xpath",
paste0("//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option[@value = ",
"'", district_IDs[j], "']"))$clickElement()
Sys.sleep(2)
# Click submit and download data of the selected district
remDrv$findElement(using = "xpath",
"//input[@value = 'Submit']")$clickElement()
Sys.sleep(2)
######### if ##########
if (remDrv$findElement("xpath", "//input[@value ='No Records found'")) { #this isnt input value, but rather a "No Records found" lookup
remDrv$goBack()
Sys.sleep(2)
}
else {
# Download data for current district
district_data <- remDrv$getPageSource()[[1]] %>%
htmlParse %>%
readHTMLTable %>%
extract2(4) %>%
extract(c(-1, -2), )
result <- data.frame(state = state_names[i], district = district_names[j],
district_data) %>% rbind(result, .)
remDrv$goBack()
Sys.sleep(2)
}
}
}
remDrv$quit()
remDrv$closeServer()
result %<>% as_data_frame %>%
rename(
si_no = V1,
holding_size = V2,
Individual_Number = V3,
Individual_Area = V4,
Joint_Number = V5,
Joint_Area = V6,
Subtotal_Number = V7,
Subtotal_Area = V8,
Institutional_Number = V9,
Institutional_Area = V10,
Total_Number = V11,
Total_Area = V12
) %>%
mutate(
si_no = as.numeric(as.character(si_no))
)
str(result)
levels(result$state)
levels(result$district)