使用带有按钮的RSelenium循环下拉菜单收集数据

时间:2016-04-15 14:24:01

标签: javascript r button dropdown rselenium

我很难收集西班牙城市随时间变化的数据。我一直在尝试使用this answer作为指导,但我认为我遇到了问题,因为要从一个下拉菜单移动到另一个下拉菜单,需要单击一个按钮。我无法弄清楚按钮在代码中的工作位置。

到目前为止我所拥有的:

#access web page
remDr <- remoteDriver(browserName="firefox", port=4444)
remDr$open(silent=T)
remDr$setImplicitWaitTimeout(3000)
remDr$navigate("http://www.ine.es/intercensal/")

我可以通过在三个元素(自治社区,省和市)中输入值并单击按钮来单独访问我想要的表格,例如:

remDr$executeScript("document.getElementById('cmbCCAA').value = 1;")
ConsultProv <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
ConsultProv$clickElement()

remDr$executeScript("document.getElementById('cmbProv').value = 4;")
ConsultMuni <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
ConsultMuni$clickElement()

remDr$executeScript("document.getElementById('cmbMuni').value = 5861;")
SubmitMuni <- remDr$findElement(using = 'name', value = 'btnBuscarGeo')
SubmitMuni$clickElement()

虽然代码可能不够优雅(道歉,但我刚开始学习R),我可以用以下方式提取单个表数据:

readname <- remDr$findElement(using = 'css selector', value = '.TITULOH3')
namedata <- readname$getElementText()
cleanname1 <- gsub("Province: ", "", namedata)
separatename1 <- str_split(cleanname1, "Municipality: ")
separatename2 <- unlist(separatename1, recursive = TRUE, use.names = TRUE)
trimmed <- trimws(separatename2)
names(trimmed) <- data.frame("Province", "Municipality", stringsAsFactors = FALSE)
finalname <- t(trimmed)

table <- htmlParse(remDr$getPageSource()[[1]])

readdfdata <- readHTMLTable(table, skip.rows = c(1, 2, 4, 5), stringsAsFactors = FALSE)
tabledfdata <- as.data.frame(readdfdata[2])
names(tabledfdata) <- c("skip", "dfpop1842", "dfpop1857", "dfpop1860", "dfpop1877", "dfpop1887", "dfpop1897", 
        "dfpop1900", "dfpop1910", "dfpop1920", "dfpop1930", "dfpop1940", "dfpop1950", "dfpop1960", 
        "dfpop1970", "dfpop1981", "dfpop1991", "dfpop2001", "dfpop2011")
cleandftable <- subset(tabledfdata, select = -c(skip))
finaldfpop <- cleandftable

readdjdata <- readHTMLTable(table, skip.rows = c(1, 2, 3, 5), stringsAsFactors = FALSE)
tabledjdata <- as.data.frame(readdjdata[2])
names(tabledjdata) <- c("skip", "djpop1842", "djpop1857", "djpop1860", "djpop1877", "djpop1887", "djpop1897", 
        "djpop1900", "djpop1910", "djpop1920", "djpop1930", "djpop1940", "djpop1950", "djpop1960", 
        "djpop1970", "djpop1981", "djpop1991", "djpop2001", "djpop2011")
cleandjtable <- subset(tabledjdata, select = -c(skip))
finaldjpop <- cleandjtable

readhhdata <- readHTMLTable(table, skip.rows = c(1, 2, 3, 4), stringsAsFactors = FALSE)
tablehhdata <- as.data.frame(readhhdata[2])
names(tablehhdata) <- c("skip", "hh1842", "hh1857", "hh1860", "hh1877", "hh1887", "hh1897", 
        "hh1900", "hh1910", "hh1920", "hh1930", "hh1940", "hh1950", "hh1960", 
        "hh1970", "hh1981", "hh1991", "hh2001", "hh2011")
cleanhhtable <- subset(tablehhdata, select = -c(skip))
finalhhpop <- cleanhhtable

readchange <- remDr$findElement(using = 'css selector', value = '.alteraciones')
changedata <- readchange$getElementText()
separatechange1 <- str_split(changedata, "\\n ")
separatechange2 <- unlist(separatechange1, recursive = TRUE, use.names = TRUE)
finalchange <- t(separatechange2)

complete <- data.frame(finalname, finaldfpop, finaldjpop, finalhhpop, finalchange, stringsAsFactors = FALSE)

但是,如何循环此过程以收集所有城市的数据?我尝试使用上一个答案中的changeFun,同时还包含了按钮,但以下代码卡住了:

#Pre-existing changeFun
changeFun <- function(value, elementName, targetName){
  changeElem <- remDr$findElement(using = "name", elementName)
  script <- paste0("arguments[0].value = '", value, "'; arguments[0].onchange();")
  remDr$executeScript(script, list(changeElem))
  targetCodes <- c()
  while(length(targetCodes) == 0){
    targetElem <- remDr$findElement(using = "name", targetName) 
    target <- xmlParse(targetElem$getElementAttribute("outerHTML")[[1]])
    targetCodes <- sapply(querySelectorAll(target, "option"), xmlGetAttr, "value")[-1]
    target <- sapply(querySelectorAll(target, "option"), xmlValue)[-1]
    if(length(targetCodes) == 0){
      Sys.sleep(0.5)
    }else{
      out <- list(target, targetCodes)
    }
  }
  return(out)
}

#This part works fine in getting autonoma names and codes
autonomas <- remDr$findElement(using = "name", "cmbCCAA") 
autonomasdata <- autonomas$getElementAttribute("outerHTML")[[1]]
autocodes <- sapply(querySelectorAll(xmlParse(autonomasdata), "option"), xmlGetAttr, "value")[-1]
autos <- sapply(querySelectorAll(xmlParse(autonomasdata), "option"), xmlValue)[-1]

#But this part for getting Province and Municipality names and codes does not run
autonoma <- list()
for(x in seq_along(autocodes)){
  ConsultProv <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
  ConsultProv$clickElement()
  province <- changeFun(autocodes[[x]], "cmbCCAA", "cmbProv")
  municipality <- lapply(province[[2]], function(y){
  ConsultMuni <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
  ConsultMuni$clickElement()
    municipality <- changeFun(y, "cmbProv", "cmbMuni")
    municipality}
    )
    list(municipality)}
  ) 
  autonoma[[x]] <- list(province, municipality)
}

关于如何使循环工作的任何建议将不胜感激。我认为我遇到的问题是合并按钮点击,但我似乎无法弄清楚如何做到这一点。谢谢!

编辑 我已经做了一些搞乱这个,我仍然认为问题在于合并按钮点击。我在下面创建了一个弗兰肯斯坦的代码怪物,它可以做某些事情。基本上,我认为我需要在一个executeScript命令中将JavaScript合并到ChangeFun或更大的代码中。不确定我是否以这种方式缩小范围会有所帮助,但我想我会更新每个人(缺乏)进展。

 sel_auto <- remDr$findElement(using = 'name', value = 'cmbCCAA')
 raw_auto <- sel_auto$getElementAttribute("outerHTML")[[1]]
 num_auto <- sapply(querySelectorAll(xmlParse(raw_auto), "option"), xmlGetAttr, "value")[-1]
 nam_auto <- sapply(querySelectorAll(xmlParse(raw_auto), "option"), xmlValue)[-1]
 for (i in seq_along(num_auto)){
   remDr$executeScript("document.getElementById('cmbCCAA').value = 1;")
   clk_auto <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
   clk_auto$clickElement()
   sel_prov <- remDr$findElement(using = 'name', value = 'cmbProv')
   raw_prov <- sel_prov$getElementAttribute("outerHTML")[[1]]
   num_prov <- sapply(querySelectorAll(xmlParse(raw_prov), "option"), xmlGetAttr, "value")[-1]
   nam_prov <- sapply(querySelectorAll(xmlParse(raw_prov), "option"), xmlValue)[-1]
   for (j in seq_along(num_prov)){
     sel_prov$clickElement()
     clk_prov <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
     clk_prov$clickElement()
     sel_muni <- remDr$findElement(using = 'name', value = 'cmbMuni')
     raw_muni <- sel_muni$getElementAttribute("outerHTML")[[1]]
     num_muni <- sapply(querySelectorAll(xmlParse(raw_muni), "option"), xmlGetAttr, "value")[-1]
     nam_muni <- sapply(querySelectorAll(xmlParse(raw_muni), "option"), xmlValue)[-1]
   }
 }

0 个答案:

没有答案