所以我有一个我想在R Studio中搜索的网址列表
url <-
ls()
我有一个代码可以在列表中有一个网址时抓取列表:
"https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=nc"
............
"https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=10&_skc=1800&rt=nc"
但是,在我的网址列表中添加多个网址时,我从一开始就会遇到以下错误。如何创建循环函数,以便我可以抓取多个网址?
library(rvest)
library(stringr)
# specify the url to scrape
# NOTE(review): this url ends in "rt=ncs" while the urls quoted above end
# in "rt=nc" -- presumably a typo carried over from the question; confirm
# which form is intended.
url <-"https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=ncs"
# read and parse the page (requires network access)
web <- read_html(url)
# define the supernode selector that wraps one entire listing block
super_node <- '.li'
# read as a vector of all listing blocks (imp: use html_nodes, plural,
# to get every match on the page)
super_node_read <- html_nodes(web, super_node)
# define the CSS selector for each field wanted from a listing block
node_model_details <- '.lvtitle'
node_description_1 <- '.lvtitle+ .lvsubtitle'
node_description_2 <- '.lvsubtitle+ .lvsubtitle'
node_model_price <- '.prc .bold'
node_shipping_info <- '.bfsp'
# extract each field as cleaned text (imp: use html_node, singular, so
# each listing block yields exactly one value -- missing fields come back
# as NA and the vectors stay aligned row-for-row)
model_details <- html_node(super_node_read, node_model_details) %>%
html_text() %>%
str_replace_all("[\t\n\r]" , "")  # strip tabs/newlines/carriage returns
description_1 <- html_node(super_node_read, node_description_1) %>%
html_text() %>%
str_replace_all("[\t\n\r]" , "")
description_2 <- html_node(super_node_read, node_description_2) %>%
html_text() %>%
str_replace_all("[\t\n\r]" , "")
model_price <- html_node(super_node_read, node_model_price) %>%
html_text() %>%
str_replace_all("[\t\n\r]" , "")
shipping_info <- html_node(super_node_read, node_shipping_info) %>%
html_text() %>%
str_replace_all("[\t\n\r]" , "")
# assemble the equal-length field vectors into one data.frame,
# one row per listing block
mobile_phone_data <- data.frame(
model_details,
description_1,
description_2,
model_price,
shipping_info
)
答案 0(得分:1)
可能没有必要循环,您可以以矢量化方式实现所有这些。您的一些节点选择器无法正常工作,所以我替换了它们。我还将您的代码改写为两个函数。
library(dplyr)
library(tidyr)
library(tibble)
library(rvest)
library(stringr)
library(purrr)
extract_node <- function(node,
                         nodes = get("super_node_read",
                                     envir = parent.frame())) {
  # Extract the text of every match of a CSS selector (or xpath) within a
  # set of parsed html nodes.
  #
  # node:  a CSS selector or xpath string.
  # nodes: the html node set to search. Defaults to the `super_node_read`
  #        visible from the *caller's* frame, so existing one-argument
  #        calls keep working.
  #
  # BUG FIX: the original body referenced `super_node_read` as a free
  # variable; under R's lexical scoping that resolves in the environment
  # where this function was defined (typically the global workspace), NOT
  # in extract_phone_details()'s frame where the node set is actually
  # built -- so it silently read stale data left over from earlier
  # interactive runs. Looking the default up in parent.frame() binds it
  # to the caller's node set instead.
  nodes %>%
    html_nodes(node) %>%
    html_text()
}
extract_phone_details <- function(url){
# Scrape one ebay search-results page and return a tibble with one row
# per listing: model details, price and shipping info.
#
# url: a single search-results page url (character scalar).
#
# NOTE(review): extract_node() above references `super_node_read` as a
# free variable; under R's lexical scoping that resolves where
# extract_node was defined (the global environment), NOT against the
# local `super_node_read` built below. Verify a stale global
# `super_node_read` is not being picked up instead of this page's nodes.
# read and parse the page (requires network access)
web <- read_html(url)
# define the supernode selector that wraps one entire listing block
super_node <- '.li'
# read as a vector of all listing blocks (imp: use html_nodes, plural)
super_node_read <- html_nodes(web, super_node)
# define the CSS selector for each field wanted from a listing
node_model_details <- '.lvtitle'
node_model_price <- '.prc'
node_shipping_info <- 'li.lvshipping >span'
# assemble the extracted field vectors into a tibble, one row per listing
mobile_phone_data <- tibble(
model_details = extract_node(node_model_details),
model_price = extract_node(node_model_price),
shipping_info = extract_node(node_shipping_info))
return(mobile_phone_data)
}
# Use the function on a single url.
# NOTE(review): this url ends in "rt=ncs" while the ones below end in
# "rt=nc" -- presumably a typo from the question; confirm which is meant.
url <- "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=ncs"
df <- extract_phone_details(url)

# Apply the same function to any number of urls: purrr::map() builds one
# tibble of results per url, and tidyr::unnest() flattens them into a
# single data frame, keeping the source url alongside each scraped row.
urls <- c(
  "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=2&_skc=200&rt=nc",
  "https://www.ebay.in/sch/i.html?_nkw=Mobile+Phones&_pgn=2&_skc=10&_skc=1800&rt=nc"
)
df <- tibble(
  url = urls,
  results = map(url, extract_phone_details)
) %>%
  # name the column explicitly: bare unnest() with no `cols` is
  # deprecated since tidyr 1.0 and warns (or errors) at runtime
  unnest(cols = results)