我需要一些帮助来抓取多个页面,因为所有的网址都与第一页类似。请建议解决方法。
第二个问题:我无法用 .mapItem 选择器取得每家企业的地址和描述。我希望改为提取 data-comp-addr 属性的值,而不是使用 .mapItem。
library(rvest)
# Download and parse the "legal" search results page into an HTML document.
webpage <- read_html("http://www.yellowpages.com.sg/search/all/legal")
# Evaluate the object on its own so the parsed document is printed for inspection.
webpage
```{r}
# Business names: select the nodes with class "normal_title", then take
# their text with surrounding whitespace trimmed.
name_html <- html_nodes(webpage, ".normal_title")
name <- name_html %>% html_text(trim = TRUE)
head(name)
# Nodes matched by the ".mapItem" selector; previewed as nodes, not as text.
desc_html <- html_nodes(webpage, ".mapItem")
head(desc_html)
```
library(readr)
# Combine the three scraped vectors into one data frame (one row per listing),
# preview its structure, and save it as CSV in the working directory.
# data.frame() expects the vectors to have matching lengths.
df <- data.frame(vectorName, vectorAddress, vectorDescription)
# str() is base R. glimpse() is not exported by rvest, readr or stringr, so it
# only works if dplyr/tibble happens to be attached elsewhere.
str(df)
write_csv(df, "legal.csv")
上面这段代码的问题是:vectorDescription 中出现了重复的条目——那些本应为空的字段,被填成了前一个条目的内容。
答案 0 :(得分:1)
你是想像下面这样抓取多个页面吗?
library(rvest)
library(stringr)
# Scrape the "legal" search results of yellowpages.com.sg page by page and
# collect exactly one name, address and description per listing item.
# Result: `df`, a data frame with the columns vectorName, vectorAddress and
# vectorDescription (one row per page/item combination).

# Login options
# NOTE(review): the form is filled in but never submitted anywhere in this
# script, so the session is not actually logged in -- confirm whether a
# login is needed at all for the search pages.
urlLog <- 'http://www.yellowpages.com.sg/user/login'
session <- html_session(urlLog)
form <- html_form(read_html(urlLog))[[1]]
filled_form <- set_values(
  form,
  searchby = 'Your username here',
  keys = 'Your password here'
)

nPages <- 3         # number of result pages to visit
itemsPerPage <- 20  # listings are looked up by id: "item1" .. "item20"

# Return the first matched value, or the placeholder string 'NA' when the
# selector matched nothing. Guarantees one value per item and column, which
# keeps the three result vectors aligned row by row.
firstOrNA <- function(values) {
  if (length(values) == 0) {
    return('NA')
  }
  values[[1]]
}

# Preallocate one slot per (page, item) instead of growing vectors with c().
totalItems <- nPages * itemsPerPage
vectorName <- character(totalItems)
vectorAddress <- character(totalItems)
vectorDescription <- character(totalItems)

count <- 0
for (numPage in seq_len(nPages)) {
  # Create the page address
  nPage <- paste0(
    'http://www.yellowpages.com.sg/search/all/legal/?page=',
    numPage
  )
  # Move the session to the page. The parsing below reads the URL directly,
  # as the original code did, so this result is kept only for inspection.
  url <- jump_to(session, nPage)
  # Download and parse the page once; all items are looked up in this
  # document instead of re-fetching the page for every item.
  pageHtml <- read_html(nPage)

  # Read the items of the page
  for (numItem in seq_len(itemsPerPage)) {
    print(paste(' -- Reading the item', numItem, ' on page', numPage))
    xPath <- paste0("//*[@id='item", numItem, "']")
    readPath <- html_nodes(pageHtml, xpath = xPath)

    # Get values
    name <- readPath %>%
      html_nodes(".normal_title") %>%
      html_text(trim = TRUE)
    address <- readPath %>%
      html_nodes(".mapItem") %>%
      html_text(trim = TRUE)
    bDescription <- readPath %>%
      html_nodes(".com_business_card") %>%
      html_text(trim = TRUE)

    # Store variables. A missing address now gets the same placeholder as a
    # missing name or description; previously it was skipped, which shifted
    # later addresses onto the wrong rows.
    count <- count + 1
    vectorName[count] <- firstOrNA(name)
    vectorAddress[count] <- firstOrNA(address)
    vectorDescription[count] <- firstOrNA(bDescription)
  }
}
df <- data.frame(vectorName, vectorAddress, vectorDescription)