Question

我正在尝试从此网站获取数据，其中包含来自里约热内卢的房地产广告：

https://www.zapimoveis.com.br/aluguel/imoveis/rj+rio-de-janeiro/?gclid=EAIaIQobChMIrLjc2u7m2QIVhYGRCh3w9g0GEAAYASAAEgJKdvD_BwE#{%22parametrosautosuggest%22:[{%22Bairro%22:%22%22,%22Zona%22:%22%22,%22Cidade%22:%22RIO%20DE%20JANEIRO%22,%22Agrupamento%22:%22%22,%22Estado%22:%22RJ%22}],%22pagina%22:%221%22,%22ordem%22:%22Relevancia%22,%22paginaOrigem%22:%22ResultadoBusca%22,%22semente%22:%222109841258%22,%22formato%22:%22Lista%22}

当我逐个指定节点时，我的代码工作正常；但是当我尝试循环遍历 XPath 节点时，rvest 包中的函数 html_text() 返回 NA。

以下是我到目前为止编写的代码：

library(rvest)
library(httr)



# Search-results page to scrape (rental ads in Rio de Janeiro).
Url<-"https://www.zapimoveis.com.br/aluguel/imoveis/rj+rio-de-janeiro/?gclid=EAIaIQobChMIrLjc2u7m2QIVhYGRCh3w9g0GEAAYASAAEgJKdvD_BwE#{%22parametrosautosuggest%22:[{%22Bairro%22:%22%22,%22Zona%22:%22%22,%22Cidade%22:%22RIO%20DE%20JANEIRO%22,%22Agrupamento%22:%22%22,%22Estado%22:%22RJ%22}],%22pagina%22:%221%22,%22ordem%22:%22Relevancia%22,%22paginaOrigem%22:%22ResultadoBusca%22,%22semente%22:%222109841258%22,%22formato%22:%22Lista%22}"


# Fetch the page with httr; `website` holds the HTTP response object.
website<- GET(Url)


# Empty vectors that will store the values collected for each ad; they are
# grown one element at a time inside the loop below.
condominio<-vector()
Iptu<-vector()


# Loop over the first two <article> nodes.
# NOTE(review): the `i` inside the quoted XPath strings below is a literal
# character, not the loop variable -- R does not interpolate variables into
# string literals. Every iteration therefore evaluates the same "article[i]"
# expression, which presumably matches no node and yields the NA described in
# the question. Building the string with e.g. paste0(..., i, ...) would insert
# the index instead.
# The response is also re-parsed by read_html() twice per iteration.
for (i in 1:2){
# Text of the <span> under the first <section> of the selected <article>.
condominio[i]<- website %>%
  read_html() %>%
html_node(xpath = "/html/body/div[3]/div[2]/section/div/article[i]/section[1]/a/div/span") %>%
html_text()

# Text of the <strong> under the first <section> of the selected <article>.
Iptu[i]<- website %>%
  read_html() %>%
  html_node(xpath = "/html/body/div[3]/div[2]/section/div/article[i]/section[1]/a/div/strong") %>%
  html_text()




}

如果我用固定数字（例如2）替换变量i，代码似乎工作正常。

有人可以帮我找到从更多广告中提取数据的方法吗？

非常感谢！

Answer 1

我更喜欢使用 CSS 选择器而不是 XPath。原代码返回 NA，是因为 XPath 字符串中的 i 只是字面字符，不会被替换为循环变量的值。html_nodes() 会一次返回所有匹配的节点，因此不需要循环。可以试试下面的写法。

library(rvest)
library(httr)

# Search-results page to scrape (rental ads in Rio de Janeiro).
# NOTE(review): everything after "#" is a URL fragment, which HTTP clients
# normally do not send to the server -- confirm that the response really
# reflects the paging/ordering parameters encoded there.
Url <- "https://www.zapimoveis.com.br/aluguel/imoveis/rj+rio-de-janeiro/?gclid=EAIaIQobChMIrLjc2u7m2QIVhYGRCh3w9g0GEAAYASAAEgJKdvD_BwE#{%22parametrosautosuggest%22:[{%22Bairro%22:%22%22,%22Zona%22:%22%22,%22Cidade%22:%22RIO%20DE%20JANEIRO%22,%22Agrupamento%22:%22%22,%22Estado%22:%22RJ%22}],%22pagina%22:%221%22,%22ordem%22:%22Relevancia%22,%22paginaOrigem%22:%22ResultadoBusca%22,%22semente%22:%222109841258%22,%22formato%22:%22Lista%22}"

# Download the page once; `website` holds the HTTP response object.
website <- GET(Url)

# Parse the response body a single time and reuse the parsed document for
# both queries instead of calling read_html() once per field.
page <- read_html(website)

# html_nodes() returns every node matching the CSS selector, and html_text()
# is vectorised over that node set, so each result is already a character
# vector with one element per match -- no loop or pre-created vector needed.
condominio <- page %>%
  html_nodes("article section a div span") %>%
  html_text()

Iptu <- page %>%
  html_nodes("article section a div strong") %>%
  html_text()

R：尝试循环遍历 XPath 节点时返回 NA

1 个答案: