R - 网页抓取动态表单时跳过缺失的数据

时间:2015-10-04 00:36:17

标签: r web-scraping rselenium

我正在使用RSelenium从一个带有动态表单的[website] [1]中抓取数据,该表单中的多个下拉菜单会根据已选内容而变化。我想获取每个邦、每个地区的“Number & Area of Operational Holdings”(经营性持有地的数量与面积)这一变量。

我已经能让代码正常运行,但是当某个地区没有数据表时(该网站的数据库中有一些地区没有数据)就会出现问题。当我的代码遇到没有数据的地区时,它就会终止,结果只得到一个不完整的数据集。

我该如何编写代码,使其能够跳过缺少数据表的地区?我的代码粘贴在下面。特别感谢之前的一个Stack Exchange帖子,我的代码是在其基础上修改的,[link here] [2]。此外,如果有人能帮我清理最终输出,避免每个新地区都重复出现变量标题,我将不胜感激。

rm(list = ls(all = TRUE))

library(RSelenium)
library(XML)
library(dplyr)
library(magrittr)
library(devtools)
library(rvest)

# Launch Selenium and connect a browser ----------------------------------------

checkForServer()
startServer()
remDrv <- remoteDriver()
remDrv$open()


# Open the query form and set the fixed filters --------------------------------

remDrv$navigate('http://agcensus.dacnet.nic.in/districtsummarytype.aspx')

# XPath prefix shared by every dropdown used below; the dropdown number and
# the closing quote/bracket are appended where the prefix is used.
dropdown_prefix <- "//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList"

# Click the <option> carrying `option_value` inside dropdown `dropdown_no`.
pick_option <- function(dropdown_no, option_value) {
  option_xpath <- paste0(dropdown_prefix, dropdown_no,
                         "']/option[@value = '", option_value, "']")
  remDrv$findElement(using = "xpath", option_xpath)$clickElement()
}

pick_option(2, "2010")  # year
pick_option(3, "1")     # 1 == Number & Area of Operational Holdings
pick_option(4, "4")     # 4 == All Social Group
pick_option(8, "3")     # 3 == All Gender (Total)

# Read the value attribute and the visible text of every state option
state_option_xpath <- paste0(dropdown_prefix, "1']/option")

state_IDs <- unlist(lapply(
  remDrv$findElements(using = "xpath", state_option_xpath),
  function(opt) opt$getElementAttribute('value')
))

state_names <- unlist(lapply(
  remDrv$findElements(using = "xpath", state_option_xpath),
  function(opt) opt$getElementText()
))


# Retrieve and download results ------------------------------------------------

# Empty template fixing the column layout of the final result. It is passed as
# the first element to rbind() after the loops, so `result` keeps these columns
# even when no district returns any data.
result <- data.frame(state = character(), district = character(),
                     V1 = character(), V2 = character(), V3 = character(),
                     V4 = character(), V5 = character(), V6 = character(),
                     V7 = character(), V8 = character(), V9 = character(),
                     V10 = character(), V11 = character(), V12 = character())

# One data frame per district that actually returned a table. They are
# collected here and bound once at the end instead of growing `result` with
# rbind() on every iteration.
district_tables <- list()

for (i in seq_along(state_IDs)) {

  # Select the current state, then wait for the district dropdown to refresh
  remDrv$findElement(using = "xpath",
                     paste0("//select[@name = '_ctl0:ContentPlaceHolder1:DropDownList1']/option[@value = ",
                            "'", state_IDs[i], "']"))$clickElement()
  Sys.sleep(2)

  # Get all district IDs and names from the currently selected state
  district_IDs <- remDrv$findElements(using = "xpath",
                                      "//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
    lapply(function(x){x$getElementAttribute('value')}) %>%
    unlist

  district_names <- remDrv$findElements(using = "xpath",
                                        "//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option") %>%
    lapply(function(x){x$getElementText()}) %>%
    unlist


  for (j in seq_along(district_IDs)) {

    remDrv$findElement(using = "xpath",
                       paste0("//div[@id = '_ctl0_ContentPlaceHolder1_Panel14']/select/option[@value = ",
                              "'", district_IDs[j], "']"))$clickElement()
    Sys.sleep(2)

    # Click submit to request the data of the selected district
    remDrv$findElement(using = "xpath",
                       "//input[@value = 'Submit']")$clickElement()
    Sys.sleep(2)

    # Fetch the page once and reuse it for both the check and the parsing
    page_source <- remDrv$getPageSource()[[1]]

    # Look for the "No Records found" text in the page source. findElement()
    # is not used for this check because it raises an error when nothing
    # matches, which would abort the whole run.
    # NOTE(review): assumes the message appears verbatim in the page source.
    if (grepl("No Records found", page_source, fixed = TRUE)) {
      district_data <- NULL
    } else {
      # The data is read from the 4th table on the page, minus its first two
      # rows. If that table is missing, warn and skip the district.
      district_data <- tryCatch(
        page_source %>%
          htmlParse %>%
          readHTMLTable %>%
          extract2(4) %>%
          extract(c(-1, -2), ),
        error = function(e) {
          warning("Skipping ", state_names[i], " / ", district_names[j],
                  ": ", conditionMessage(e), call. = FALSE)
          NULL
        }
      )
    }

    # Keep only districts that produced at least one data row; data.frame()
    # cannot combine the scalar state/district labels with a zero-row table.
    if (!is.null(district_data) && nrow(district_data) > 0) {
      district_tables[[length(district_tables) + 1]] <-
        data.frame(state = state_names[i], district = district_names[j],
                   district_data)
    }

    # Return to the form in every case, as the original branches both did
    remDrv$goBack()
    Sys.sleep(2)
  }
}

# Bind the template and all collected district tables in a single step
result <- do.call(rbind, c(list(result), district_tables))

# End the browser session, then shut the Selenium server down
remDrv$quit()
remDrv$closeServer()

# Replace the generic V1..V12 column names with descriptive ones
result <- rename(
  as_data_frame(result),
  si_no = V1,
  holding_size = V2,
  Individual_Number = V3,
  Individual_Area = V4,
  Joint_Number = V5,
  Joint_Area = V6,
  Subtotal_Number = V7,
  Subtotal_Area = V8,
  Institutional_Number = V9,
  Institutional_Area = V10,
  Total_Number = V11,
  Total_Area = V12
)

# Convert via character so that, if si_no is a factor, its labels rather than
# its internal integer codes become the numeric values
result <- mutate(result, si_no = as.numeric(as.character(si_no)))

# Inspect the structure and the scraped state/district levels
str(result)
levels(result$state)
levels(result$district)

0 个答案:

没有答案