How to scrape all pages (1, 2, 3, ... n) from a website with rvest

Asked: 2016-08-24 16:59:59

Tags: r web-scraping rvest

I want to read a list of .html files and extract data from them. Any help is appreciated.

library(rvest)
library(XML)
library(stringr)
library(data.table)
library(RCurl)

u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- ("C:/R/BNB/")
pages <- html_text(html_node(u1, ".results_count"))
Total_Pages <- substr(pages, 4, 7)
TP <- as.numeric(Total_Pages)
# read each results page and write it out as a separate .html file
for (i in 1:TP) {
  url <- paste(u0, "page=/", i, sep = "")
  download.file(url, paste(download_folder, i, ".html", sep = ""))
  #create html object
  html <- html(paste(download_folder, i, ".html", sep = ""))
}
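
For context, the loop above only downloads and re-reads each page; the extraction step described in the question ("read a list of .html files to extract data") might look something like the sketch below, once the pages are on disk. The ".job_listing" selector is a hypothetical placeholder, not confirmed against the site; inspect the saved pages and substitute whatever selector matches the actual markup.

# sketch: read the saved .html files back in and pull out the listings
# NOTE: ".job_listing" is a hypothetical selector for illustration only
files <- list.files(download_folder, pattern = "\\.html$", full.names = TRUE)
jobs <- unlist(lapply(files, function(f) {
  page <- read_html(f)
  html_text(html_nodes(page, ".job_listing"))  # hypothetical selector
}))
head(jobs)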

1 Answer:

Answer 0 (score: 0)

Here is a potential solution:

library(rvest)
library(stringr)

u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- getwd()  # note: output now goes to the working directory

# derive the page count from the pagination links: take the largest
# number among the "a.page-numbers" link labels (non-numeric labels
# such as "Next" coerce to NA and are dropped by na.rm = TRUE)
TP <- max(as.integer(html_text(html_nodes(u1, "a.page-numbers"))), na.rm = TRUE)

# read each results page and save it as a separate .html file
for (i in 1:TP) {
  url <- paste(u0, "page/", i, "/", sep = "")
  print(url)
  # file.path() is needed here because getwd() has no trailing slash
  dest <- file.path(download_folder, paste(i, ".html", sep = ""))
  download.file(url, dest)
  # create an html object from the saved file
  html <- read_html(dest)
}

I could not find the class .results_count in the html, so I looked for the page-numbers class instead and took the highest value it returned. Also, the html() function is deprecated, so I replaced it with read_html(). Good luck!
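
One small caveat with the page-count line above: as.integer() raises an "NAs introduced by coercion" warning when it hits non-numeric link labels such as "Next". A minimal, slightly more defensive variant of the same lookup:

# same page-count lookup, with the coercion warning suppressed
page_labels <- html_text(html_nodes(u1, "a.page-numbers"))
TP <- suppressWarnings(max(as.integer(page_labels), na.rm = TRUE))
TP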