在 R 中使用 rvest 进行 Web Scraping 并整理为 data.frame

时间:2018-04-30 23:05:02

标签: r web-scraping rvest stringr

所以我有一个我想在R Studio中搜索的网址列表

url <-

ls()

我有一个代码可以在列表中有一个网址时抓取列表:

"https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=nc"
............
"https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=10&_skc=1800&rt=nc"

但是,在我的网址列表中添加多个网址后,我一开始运行就会遇到以下错误。如何编写循环函数,以便我可以抓取多个网址?

library(rvest)
library(stringr)

# specify the url of the eBay search-results page to scrape
url <- "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=ncs"

# read the page
web <- read_html(url)

# define the supernode that has the entire block of information
super_node <- '.li'

# read as vector of all blocks of supernode (imp: use html_nodes function)
super_node_read <- html_nodes(web, super_node)

# helper: within each listing block, extract the text matched by `selector`
# and strip tabs / newlines.  html_node() (singular) returns exactly one
# result per block (NA when missing), so every column below has the same
# length and data.frame() lines up row-wise.  This replaces five identical
# copy-pasted pipelines in the original.
extract_field <- function(selector) {
  html_node(super_node_read, selector) %>%
    html_text() %>%
    str_replace_all("[\t\n\r]", "")
}

# extract the output for each as cleaned text
model_details <- extract_field('.lvtitle')
description_1 <- extract_field('.lvtitle+ .lvsubtitle')
description_2 <- extract_field('.lvsubtitle+ .lvsubtitle')
model_price   <- extract_field('.prc .bold')
shipping_info <- extract_field('.bfsp')

# create the data.frame (one row per listing block)
mobile_phone_data <- data.frame(
    model_details,
    description_1,
    description_2,
    model_price,
    shipping_info
)

1 个答案:

答案 0 :(得分:1)

可能没有必要写显式循环,你可以用向量化的方式完成所有这些。你的一些节点选择器不起作用,所以我替换了它们。我还把你的代码重构成了两个函数。

library(dplyr)
library(tidyr)
library(tibble)
library(rvest)
library(stringr)
library(purrr)

extract_node <- function(node, node_set = super_node_read) {
  # Extract the text of every element matching a CSS selector or xpath.
  #
  # node     : a css selector or xpath string.
  # node_set : the html node set to search within.
  #
  # BUG in the original: `super_node_read` was a free variable, so under
  # R's lexical scoping it was resolved in the *global* environment, NOT
  # inside extract_phone_details() where the node set is actually created.
  # In a clean session the call fails (object not found); worse, it can
  # silently pick up a stale `super_node_read` left over in the workspace.
  # Making it an explicit argument fixes this; the old global lookup is
  # kept as the default so existing one-argument calls still behave the
  # same.
  node_set %>%
    html_nodes(node) %>%
    html_text()
}

extract_phone_details <- function(url){

  # Scrape one eBay search-results page and return a tibble with one
  # column per listing field.
  #
  # url : character scalar -- the search-results page to read.
  # Returns a tibble with columns model_details, model_price and
  # shipping_info.

  # read the page
  web <- read_html(url)

  # define the supernode that has the entire block of information
  super_node <- '.li'

  # read as vector of all blocks of supernode (imp: use html_nodes function)
  super_node_read <- html_nodes(web, super_node)

  # local helper so that `super_node_read` is found by lexical scoping.
  # The original called a globally defined extract_node() which looked the
  # variable up in the global environment and therefore failed (or silently
  # used stale workspace data) when run in a clean session.
  extract_node <- function(node) {
    super_node_read %>%
      html_nodes(node) %>%
      html_text()
  }

  # define each node element that you want
  node_model_details <- '.lvtitle'
  node_model_price   <- '.prc'
  node_shipping_info <- 'li.lvshipping >span'

  # create the tibble
  # NOTE(review): html_nodes() drops listings where a selector has no
  # match, so the three vectors can differ in length and tibble() will
  # then error -- confirm against live pages (html_node(), singular,
  # per block would guarantee equal lengths).
  mobile_phone_data <- tibble(
    model_details = extract_node(node_model_details),
    model_price = extract_node(node_model_price),
    shipping_info = extract_node(node_shipping_info))

  return(mobile_phone_data)
}

# using the function on a single url
url <- "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=ncs"

df <- extract_phone_details(url)

# now you can apply this same function on any number of urls by using
# purrr's map function and tidyr's unnest
urls <- c("https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=nc",
          "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=10&_skc=1800&rt=nc")

# one row per url; `results` is a list-column holding one tibble per page,
# which unnest() expands into a single flat tibble.  Bare unnest() with no
# columns is deprecated since tidyr 1.0 -- name the list-column explicitly.
df <- tibble(url = urls,
             results = map(url, extract_phone_details)) %>%
  unnest(cols = results)