RSelenium中的循环和抓取

时间:2018-12-27 10:59:24

标签: for-loop rvest rselenium

我正在尝试使用RSelenium从网站上抓取数据。我可以逐个浏览下拉菜单,但是当我循环运行它们时会出错。

在选择下拉列表中的所有值之后,我还想在表格中存储设施名称和联系方式。到目前为止我还无法做到。

# Remove every object from the current environment so the script starts clean.
rm(list = ls())

# Move into the data directory, then query the working directory to confirm it.
setwd("D:\\work_codes\\kvk\\data")
getwd()

library(RSelenium)
library(rvest)
library(XML)
library(RCurl)
library(magrittr)
library(stringr)

# Start a Selenium server with a browser and keep the client handle.
rd <- rsDriver()
remDr <- rd[["client"]]

# Load the facilities page, then reload it once.
remDr$navigate("https://kvk.icar.gov.in/facilities_list.aspx")
remDr$refresh()

# The state drop-down exposes its options as one text blob, one option per
# line. Split it, drop the " --Select--" placeholder and strip the leading
# whitespace of each remaining label.
stateEle <- remDr$findElement("id", "ContentPlaceHolder1_ddlState")
states <- str_trim(
  setdiff(
    unlist(strsplit(stateEle$getElementText()[[1]], "\\n")),
    " --Select--"
  ),
  "left"
)
stateEle$clickElement()

# Walk every state -> district -> KVK combination, submit the form for each
# one, and record the first listed facility together with its contact text.
# The per-combination rows are gathered in `facility_rows` and combined into
# `facility_df` once the loops finish.

# Seconds to pause after each action that changes the page.
# NOTE(review): assumes choosing an option or submitting makes the page reload
# and that the reload completes within this time -- tune it, or replace it
# with an explicit wait on the element that is needed next.
page_wait <- 2

# Return the option labels currently shown by the drop-down with the given id,
# without the " --Select--" placeholder and without leading whitespace.
dropdown_options <- function(id) {
  ele <- remDr$findElement("id", id)
  labels <- unlist(strsplit(ele$getElementText()[[1]], "\\n"))
  str_trim(setdiff(labels, " --Select--"), "left")
}

# Choose `label` in the drop-down with the given id by typing it.
choose_option <- function(id, label) {
  # Look the element up on every call: a handle obtained before the page was
  # refreshed or reloaded is stale and raises an error when it is used again.
  ele <- remDr$findElement("id", id)
  ele$clickElement()
  ele$sendKeysToElement(list(label))
  ele$clickElement()
  Sys.sleep(page_wait)
}

facility_rows <- list()

# seq_along() is used throughout: 1:length(x) would iterate over c(1, 0) when
# a drop-down turns out to be empty.
for (i in seq_along(states)) {
  remDr$refresh()
  choose_option("ContentPlaceHolder1_ddlState", states[i])
  districts <- dropdown_options("ContentPlaceHolder1_ddlDistrict")

  for (j in seq_along(districts)) {
    choose_option("ContentPlaceHolder1_ddlDistrict", districts[j])
    kvk <- dropdown_options("ContentPlaceHolder1_ddlKvk")

    for (k in seq_along(kvk)) {
      # Select the k-th KVK; always sending the first entry would submit the
      # same KVK on every pass.
      choose_option("ContentPlaceHolder1_ddlKvk", kvk[k])

      submit_ele <- remDr$findElement("id", "ContentPlaceHolder1_btnSubmit")
      submit_ele$clickElement()
      Sys.sleep(page_wait)

      # findElements() returns an empty list instead of raising an error when
      # the selected KVK lists no facility.
      facility <- remDr$findElements(
        "id", "ContentPlaceHolder1_rptfacility_f_name_1"
      )
      facility_name <- NA_character_
      contact_text <- ""
      if (length(facility) > 0) {
        facility_name <- facility[[1]]$getElementText()[[1]]
        facility[[1]]$clickElement()
        Sys.sleep(page_wait)

        # "Contact details:" is visible text, not a class name, so it cannot
        # be used with the "class name" locator.
        # NOTE(review): assumes the contact details are rendered inside
        # ".panel-body" nodes -- confirm against the page markup.
        panels <- remDr$findElements("css selector", ".panel-body")
        contact_text <- paste(
          vapply(panels, function(p) p$getElementText()[[1]], character(1)),
          collapse = "\n"
        )
      }

      facility_rows[[length(facility_rows) + 1L]] <- data.frame(
        STATE = states[i],
        DISTRICT = districts[j],
        KVK = kvk[k],
        FACILITY = facility_name,
        CONTACT_TEXT = contact_text,
        stringsAsFactors = FALSE
      )
    }
  }
}

# One row per state/district/KVK combination (NULL when nothing was collected).
facility_df <- do.call(rbind, facility_rows)

1 个答案:

答案 0 :(得分:1)

library(rvest)
# Open a session on the facilities page and keep the first form found on it.
url <- "https://kvk.icar.gov.in/facilities_list.aspx"
page <- html_session(url)
form <- html_form(page)[[1]]

# Values and visible labels of the state drop-down's <option> nodes.
# The first option is dropped from both vectors (presumably the "--Select--"
# placeholder -- verify against the page).
states <- html_attr(
  html_nodes(page, css = "#ContentPlaceHolder1_ddlState > option"),
  "value"
)[-1]
states_name <- html_text(
  html_nodes(page, css = "#ContentPlaceHolder1_ddlState > option")
)[-1]

# Submit the form once per state, per district and per KVK, and collect the
# contact text of every state/district/KVK combination.
# `final_df` accumulates one data frame per combination; `output_df` is the
# combined table.
final_df <- list()

#### STATES LOOP ####
# seq_along() is used throughout: 1:length(x) would iterate over c(1, 0) when
# a drop-down has no entries left after its first option is dropped.
for (i in seq_along(states)) {
  filled_form <- set_values(
    form,
    "ctl00$ContentPlaceHolder1$ddlState" = states[i]
  )
  page1 <- submit_form(page, filled_form)

  # District options offered for the chosen state (first option dropped).
  district_nodes <- html_nodes(
    page1, css = "#ContentPlaceHolder1_ddlDistrict > option"
  )
  district <- html_attr(district_nodes, "value")[-1]
  district_name <- html_text(district_nodes)[-1]

  #### DISTRICT LOOP ####
  for (j in seq_along(district)) {
    filled_form1 <- set_values(
      html_form(page1)[[1]],
      "ctl00$ContentPlaceHolder1$ddlState" = states[i],
      "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j]
    )
    page2 <- submit_form(page1, filled_form1)

    # KVK options offered for the chosen district (first option dropped).
    kvk_nodes <- html_nodes(page2, css = "#ContentPlaceHolder1_ddlKvk > option")
    kvk <- html_attr(kvk_nodes, "value")[-1]
    # NOTE(review): the labels are collected but the KVK column below stores
    # the option value kvk[k] -- confirm which one the output should carry.
    kvk_name <- html_text(kvk_nodes)[-1]

    #### KVK LOOP ####
    for (k in seq_along(kvk)) {
      filled_form2 <- set_values(
        html_form(page2)[[1]],
        "ctl00$ContentPlaceHolder1$ddlState" = states[i],
        "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j],
        "ctl00$ContentPlaceHolder1$ddlKvk" = kvk[k]
      )
      page3 <- submit_form(page2, filled_form2)

      # Text of every ".panel-body" node with line breaks removed; fall back
      # to one empty string so the combination still yields a row.
      contact_text <- gsub(
        "[\r\n]", "",
        html_text(html_nodes(page3, css = ".panel-body"))
      )
      if (length(contact_text) == 0) {
        contact_text <- ""
      }

      df <- data.frame(
        STATE = states_name[i],
        DISTRICT = district_name[j],
        KVK = kvk[k],
        CONTACT_TEXT = contact_text,
        stringsAsFactors = FALSE
      )

      # Append rather than index by i * j * k: that product is not unique
      # (i = 1, j = 2 and i = 2, j = 1 collide), so earlier results would be
      # overwritten.
      final_df[[length(final_df) + 1L]] <- df

      ### WAITTIME TO AVOID HTTP 500 error - So the server is not overloaded
      Sys.sleep(5)
    }
  }
}


output_df <- data.table::rbindlist(final_df, fill = TRUE)

# After this perform some string operations to extract the exact information required from the CONTACT_TEXT variable

以上答案没有使用RSelenium包,而是只用rvest直接提交表单;我认为这种做法比RSelenium更可靠。