我正在尝试使用rvest从电子商务网站中删除一些数据。我没有找到任何好的例子来指导我。有什么想法吗?
以我的开始为例:
library(rvest)
library(purrr)
#Specifying the url
url_base <- 'https://telefonia.mercadolibre.com.uy/accesorios-celulares/'
#Reading the HTML code from the website
webpage <- read_html(url)
#Using CSS selectors to scrap the titles section
title_html <- html_nodes(webpage,'.main-title')
#Converting the title data to text
title <- html_text(title_html)
head(title)
#Using CSS selectors to scrap the price section
price <- html_nodes(webpage,'.item__price')
price <- html_text(price)
price
所以,我想做两件基本的事情:
任何帮助?
谢谢。
答案 0 :(得分:0)
抓住信息并不困难,并且可以用rvest做
你需要做的是获得所有的hrefs并循环它们。为此,您需要使用html_attr()
以下代码应该完成这项工作:
library(tidyverse)
library(rvest)
#Specifying the url
url_base <- 'https://telefonia.mercadolibre.com.uy/accesorios-celulares/'
#You need to get href and loop on hrefs
all_pages <- url_base %>% read_html() %>% html_nodes(".pagination__page > a") %>% html_attr("href")
all_pages[1] <- url_base
#create an empty table to store results
result_table <- tibble()
for(page in all_pages){
page_source <- read_html(page)
title <- html_nodes(page_source,'.item__info-title') %>% html_text()
price <- html_nodes(page_source,'.item__price') %>% html_text()
item_link <- html_nodes(page_source,'.item__info-title') %>% html_attr("href")
temp_table <- tibble(title = title, price = price, item_link = item_link)
result_table <- bind_rows(result_table,temp_table)
}
在获得每个项目的链接后,您可以循环链接项目。
如您所见,后缀中有一个模式;您只需每次添加数字50即可浏览更多页面。
> all_pages
[1] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/"
[2] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_51"
[3] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_101"
[4] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_151"
[5] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_201"
[6] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_251"
[7] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_301"
[8] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_351"
[9] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_401"
[10] "https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_451"
所以我们可以这样做:
str_c("https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_",seq.int(from = 51,by = 50,length.out = 40))
pagesource <- read_html("https://articulo.mercadolibre.com.uy/MLU-449598178-protector-funda-clear-cover-samsung-galaxy-note-8-_JM")
n_vendor <- pagesource %>% html_node(".item-conditions") %>% html_text() %>% remove_nt()
product_description <- pagesource %>% html_node(".item-title__primary") %>% html_text() %>% remove_nt()
n_opinion <- pagesource %>% html_node(".average-legend span:nth-child(1)") %>% html_text()
product_price <- pagesource %>% html_nodes(".price-tag-fraction") %>% html_text()
current_table <- tibble(product_description = product_description,
product_price = product_price,
n_vendor = n_vendor,
n_opinion = n_opinion)
print(current_table)
# A tibble: 1 x 4
product_description product_price n_vendor n_opinion
<chr> <chr> <chr> <chr>
1 Protector Funda Clear Cover Samsung Galaxy Note 8 14 14vendidos 2
您可以循环上面的代码块并获取所有信息。
以下代码应该可以使用,您可以删除5页限制来清除所有产品信息。
library(tidyverse)
library(rvest)
#Specifying the url
url_base <- 'https://telefonia.mercadolibre.com.uy/accesorios-celulares/'
#You need to get href and loop on hrefs
all_pages <- url_base %>% read_html() %>% html_nodes(".pagination__page > a") %>% html_attr("href")
all_pages <- c(url_base,
str_c("https://telefonia.mercadolibre.com.uy/accesorios-celulares/_Desde_",
seq.int(from = 51,by = 50,length.out = 40)))
#create an empty table to store results
result_table <- tibble()
for(page in all_pages[1:5]){ #as an example, only scrape the first 5 pages
page_source <- read_html(page)
title <- html_nodes(page_source,'.item__info-title') %>% html_text()
price <- html_nodes(page_source,'.item__price') %>% html_text()
item_link <- html_nodes(page_source,'.item__info-title') %>% html_attr("href")
temp_table <- tibble(title = title, price = price, item_link = item_link)
result_table <- bind_rows(result_table,temp_table)
}
#loop on result table(item_link):
product_table <- tibble()
for(i in 1:nrow(result_table)){
pagesource <- read_html(result_table[[i,"item_link"]])
n_vendor <- pagesource %>% html_node(".item-conditions") %>% html_text() %>% remove_nt()
product_description <- pagesource %>% html_node(".item-title__primary") %>% html_text() %>% remove_nt()
currency_symbol <- pagesource %>% html_node(".price-tag-symbol") %>% html_text()
n_opinion <- pagesource %>% html_node(".average-legend span:nth-child(1)") %>% html_text()
product_price <- pagesource %>% html_nodes(".price-tag-fraction") %>% html_text()
current_table <- tibble(product_description = product_description,
currency_symbol = currency_symbol,
product_price = product_price,
n_vendor = n_vendor,
n_opinion = n_opinion,
item_link = result_table[[i,"item_link"]])
product_table <- bind_rows(product_table,current_table)
}
结果:
代码中仍然存在一些错误,例如:
在此页面上,有两个项目与css选择器匹配,这可能会破坏代码。但是有一些解决方案: