如果网址未发生变化,您如何通过R抓取网页

时间:2018-02-15 12:10:57

标签: r web-scraping rvest

我需要一些帮助来抓取多个页面,因为所有的网址都与第一页类似。请建议解决方法。

第二个问题:我无法用 .mapItem 选出每家企业的地址和描述。我想提取的不是 .mapItem 的文本,而是 data-comp-addr 字段。

library(rvest)
# Download and parse the first results page of the "legal" search, then show
# the parsed document.
search_url <- "http://www.yellowpages.com.sg/search/all/legal"
webpage <- read_html(search_url)
print(webpage)

```{r}
# Business names: every node carrying the "normal_title" class, reduced to its
# whitespace-trimmed text.
name_html <- html_nodes(webpage, ".normal_title")
name <- name_html %>% html_text(trim = TRUE)
print(head(name))

# Raw ".mapItem" nodes; only the first few are shown for inspection.
desc_html <- html_nodes(webpage, ".mapItem")
print(head(desc_html))
```

尝试写入csv

library(readr)
# Combine the three scraped vectors into one table, one column per vector.
# Text columns are kept as character rather than converted to factors.
df <- data.frame(
  vectorName,
  vectorAddress,
  vectorDescription,
  stringsAsFactors = FALSE
)

# Compact overview of the columns. Base str() is used because glimpse() is not
# provided by readr or rvest and would need dplyr/tibble to be attached.
str(df)

# Write the table to "legal.csv" in the working directory.
write_csv(df, "legal.csv")

上面这段代码的问题是:vectorDescription 中出现了重复条目——本应为空的字段重复了前一个条目的内容。

1 个答案:

答案 0 :(得分:1)

你是想像下面这样抓取多个页面吗?

library(rvest)
library(stringr)

# Scrape several result pages of the "legal" search and collect, for every
# listing, its name, address and description. The results end up in the
# character vectors vectorName, vectorAddress and vectorDescription and in the
# data frame df built from them.

# Login options
# NOTE: the form is filled in here but never submitted anywhere in this script,
# so no login request is actually sent.
urlLog <- "http://www.yellowpages.com.sg/user/login"
session <- html_session(urlLog)
form <- html_form(read_html(urlLog))[[1]]
filled_form <- set_values(
  form,
  searchby = "Your username here",
  keys = "Your password here"
)

# Number of result pages to visit and number of listings read on each page.
numPages <- 3L
itemsPerPage <- 20L

# Return the trimmed text of the nodes matching `css` inside `item`, or the
# placeholder "NA" when nothing matches, so that every listing contributes at
# least one value to every column.
text_or_na <- function(item, css) {
  txt <- item %>%
    html_nodes(css) %>%
    html_text(trim = TRUE)
  if (length(txt) == 0) "NA" else txt
}

# One slot per listing; filled inside the loop and flattened afterwards
# instead of growing the vectors with c() on every iteration.
nameList <- vector("list", numPages * itemsPerPage)
addressList <- vector("list", numPages * itemsPerPage)
descriptionList <- vector("list", numPages * itemsPerPage)

count <- 0L

for (numPage in seq_len(numPages)) {

  # Create the page address and download/parse it once per page
  nPage <- paste0(
    "http://www.yellowpages.com.sg/search/all/legal/?page=",
    numPage
  )
  pageDoc <- read_html(jump_to(session, nPage))

  # Read the items of the page
  for (numItem in seq_len(itemsPerPage)) {

    print(paste(' -- Reading the item', numItem, ' on page', numPage))

    # Listings are addressed by id: item1, item2, ...
    xPath <- sprintf("//*[@id='item%d']", numItem)
    readPath <- html_nodes(pageDoc, xpath = xPath)

    # Get values; each field falls back to "NA" independently, so a missing
    # address no longer leaves vectorAddress shorter than the other vectors.
    count <- count + 1L
    nameList[[count]] <- text_or_na(readPath, ".normal_title")
    addressList[[count]] <- text_or_na(readPath, ".mapItem")
    descriptionList[[count]] <- text_or_na(readPath, ".com_business_card")
  }
}

# Store variables
vectorName <- unlist(nameList)
vectorAddress <- unlist(addressList)
vectorDescription <- unlist(descriptionList)

df <- data.frame(
  vectorName,
  vectorAddress,
  vectorDescription,
  stringsAsFactors = FALSE
)