我很难收集西班牙城市随时间变化的数据。我一直在尝试使用this answer作为指导,但我认为我遇到了问题,因为要从一个下拉菜单移动到另一个下拉菜单,需要单击一个按钮。我无法弄清楚按钮在代码中的工作位置。
到目前为止我所拥有的:
#access web page
remDr <- remoteDriver(browserName="firefox", port=4444)
remDr$open(silent=T)
remDr$setImplicitWaitTimeout(3000)
remDr$navigate("http://www.ine.es/intercensal/")
我可以通过在三个元素(自治社区,省和市)中输入值并单击按钮来单独访问我想要的表格,例如:
remDr$executeScript("document.getElementById('cmbCCAA').value = 1;")
ConsultProv <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
ConsultProv$clickElement()
remDr$executeScript("document.getElementById('cmbProv').value = 4;")
ConsultMuni <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
ConsultMuni$clickElement()
remDr$executeScript("document.getElementById('cmbMuni').value = 5861;")
SubmitMuni <- remDr$findElement(using = 'name', value = 'btnBuscarGeo')
SubmitMuni$clickElement()
虽然代码可能不够优雅(道歉,但我刚开始学习R),我可以用以下方式提取单个表数据:
readname <- remDr$findElement(using = 'css selector', value = '.TITULOH3')
namedata <- readname$getElementText()
cleanname1 <- gsub("Province: ", "", namedata)
separatename1 <- str_split(cleanname1, "Municipality: ")
separatename2 <- unlist(separatename1, recursive = TRUE, use.names = TRUE)
trimmed <- trimws(separatename2)
names(trimmed) <- data.frame("Province", "Municipality", stringsAsFactors = FALSE)
finalname <- t(trimmed)
table <- htmlParse(remDr$getPageSource()[[1]])
readdfdata <- readHTMLTable(table, skip.rows = c(1, 2, 4, 5), stringsAsFactors = FALSE)
tabledfdata <- as.data.frame(readdfdata[2])
names(tabledfdata) <- c("skip", "dfpop1842", "dfpop1857", "dfpop1860", "dfpop1877", "dfpop1887", "dfpop1897",
"dfpop1900", "dfpop1910", "dfpop1920", "dfpop1930", "dfpop1940", "dfpop1950", "dfpop1960",
"dfpop1970", "dfpop1981", "dfpop1991", "dfpop2001", "dfpop2011")
cleandftable <- subset(tabledfdata, select = -c(skip))
finaldfpop <- cleandftable
readdjdata <- readHTMLTable(table, skip.rows = c(1, 2, 3, 5), stringsAsFactors = FALSE)
tabledjdata <- as.data.frame(readdjdata[2])
names(tabledjdata) <- c("skip", "djpop1842", "djpop1857", "djpop1860", "djpop1877", "djpop1887", "djpop1897",
"djpop1900", "djpop1910", "djpop1920", "djpop1930", "djpop1940", "djpop1950", "djpop1960",
"djpop1970", "djpop1981", "djpop1991", "djpop2001", "djpop2011")
cleandjtable <- subset(tabledjdata, select = -c(skip))
finaldjpop <- cleandjtable
readhhdata <- readHTMLTable(table, skip.rows = c(1, 2, 3, 4), stringsAsFactors = FALSE)
tablehhdata <- as.data.frame(readhhdata[2])
names(tablehhdata) <- c("skip", "hh1842", "hh1857", "hh1860", "hh1877", "hh1887", "hh1897",
"hh1900", "hh1910", "hh1920", "hh1930", "hh1940", "hh1950", "hh1960",
"hh1970", "hh1981", "hh1991", "hh2001", "hh2011")
cleanhhtable <- subset(tablehhdata, select = -c(skip))
finalhhpop <- cleanhhtable
readchange <- remDr$findElement(using = 'css selector', value = '.alteraciones')
changedata <- readchange$getElementText()
separatechange1 <- str_split(changedata, "\\n ")
separatechange2 <- unlist(separatechange1, recursive = TRUE, use.names = TRUE)
finalchange <- t(separatechange2)
complete <- data.frame(finalname, finaldfpop, finaldjpop, finalhhpop, finalchange, stringsAsFactors = FALSE)
但是,如何循环此过程以收集所有城市的数据?我尝试使用上一个答案中的changeFun,同时还包含了按钮,但以下代码卡住了:
#Pre-existing changeFun
changeFun <- function(value, elementName, targetName){
changeElem <- remDr$findElement(using = "name", elementName)
script <- paste0("arguments[0].value = '", value, "'; arguments[0].onchange();")
remDr$executeScript(script, list(changeElem))
targetCodes <- c()
while(length(targetCodes) == 0){
targetElem <- remDr$findElement(using = "name", targetName)
target <- xmlParse(targetElem$getElementAttribute("outerHTML")[[1]])
targetCodes <- sapply(querySelectorAll(target, "option"), xmlGetAttr, "value")[-1]
target <- sapply(querySelectorAll(target, "option"), xmlValue)[-1]
if(length(targetCodes) == 0){
Sys.sleep(0.5)
}else{
out <- list(target, targetCodes)
}
}
return(out)
}
#This part works fine in getting autonoma names and codes
autonomas <- remDr$findElement(using = "name", "cmbCCAA")
autonomasdata <- autonomas$getElementAttribute("outerHTML")[[1]]
autocodes <- sapply(querySelectorAll(xmlParse(autonomasdata), "option"), xmlGetAttr, "value")[-1]
autos <- sapply(querySelectorAll(xmlParse(autonomasdata), "option"), xmlValue)[-1]
#But this part for getting Province and Municipality names and codes does not run
autonoma <- list()
for(x in seq_along(autocodes)){
ConsultProv <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
ConsultProv$clickElement()
province <- changeFun(autocodes[[x]], "cmbCCAA", "cmbProv")
municipality <- lapply(province[[2]], function(y){
ConsultMuni <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
ConsultMuni$clickElement()
municipality <- changeFun(y, "cmbProv", "cmbMuni")
municipality}
)
list(municipality)}
)
autonoma[[x]] <- list(province, municipality)
}
关于如何使循环工作的任何建议将不胜感激。我认为我遇到的问题是合并按钮点击,但我似乎无法弄清楚如何做到这一点。谢谢!
编辑 我已经做了一些搞乱这个,我仍然认为问题在于合并按钮点击。我在下面创建了一个弗兰肯斯坦的代码怪物,它可以做某些事情。基本上,我认为我需要在一个executeScript命令中将JavaScript合并到ChangeFun或更大的代码中。不确定我是否以这种方式缩小范围会有所帮助,但我想我会更新每个人(缺乏)进展。
sel_auto <- remDr$findElement(using = 'name', value = 'cmbCCAA')
raw_auto <- sel_auto$getElementAttribute("outerHTML")[[1]]
num_auto <- sapply(querySelectorAll(xmlParse(raw_auto), "option"), xmlGetAttr, "value")[-1]
nam_auto <- sapply(querySelectorAll(xmlParse(raw_auto), "option"), xmlValue)[-1]
for (i in seq_along(num_auto)){
remDr$executeScript("document.getElementById('cmbCCAA').value = 1;")
clk_auto <- remDr$findElement(using = 'name', value = 'btnBuscarProv')
clk_auto$clickElement()
sel_prov <- remDr$findElement(using = 'name', value = 'cmbProv')
raw_prov <- sel_prov$getElementAttribute("outerHTML")[[1]]
num_prov <- sapply(querySelectorAll(xmlParse(raw_prov), "option"), xmlGetAttr, "value")[-1]
nam_prov <- sapply(querySelectorAll(xmlParse(raw_prov), "option"), xmlValue)[-1]
for (j in seq_along(num_prov)){
sel_prov$clickElement()
clk_prov <- remDr$findElement(using = 'name', value = 'btnBuscarMuni')
clk_prov$clickElement()
sel_muni <- remDr$findElement(using = 'name', value = 'cmbMuni')
raw_muni <- sel_muni$getElementAttribute("outerHTML")[[1]]
num_muni <- sapply(querySelectorAll(xmlParse(raw_muni), "option"), xmlGetAttr, "value")[-1]
nam_muni <- sapply(querySelectorAll(xmlParse(raw_muni), "option"), xmlValue)[-1]
}
}