Question

嗨，我是 R 和编程的新手。我想用 rvest 和 purrr 抓取一个网站，从多个页面中提取数据。但每次运行代码时，都会出现错误“Error in open.connection(x, "rb") : HTTP error 404.”。请帮帮我。

# URL template for the catalogue pages; %d is filled with the page number.
# NOTE(review): the template has no ".html" suffix, so sprintf() below builds
# URLs such as ".../catalogue/page-1". This is the likely source of the
# HTTP 404 reported for this snippet - confirm against the site's page URLs.
url <- "http://books.toscrape.com/catalogue/page-%d"

# Visit pages 1..10 and row-bind the per-page results into one data frame.
map_df(1:10, function(i){ 

  # Download and parse page i, then print a "." as a progress marker.
  page <- read_html(sprintf(url, i))
   cat(".")

  # Collect title, price (with the "£" sign stripped) and star rating
  # (class word One..Five mapped to 1..5) for the books on this page.
  # NOTE(review): `<-` inside a call assigns local variables instead of naming
  # the arguments (column names need `=`), and purrr::safely() expects a
  # function to wrap rather than already-evaluated values - verify the
  # intended usage.
  booksdf <- data.frame(safely( title <- html_nodes(page, "h3, #title") %>% html_text(),
                       price <- html_nodes(page, ".price_color") %>% html_text() %>% gsub("£", "", .),
                       rating <- html_nodes(page, ".star-rating") %>% html_attrs() %>% str_remove("star-rating") %>%str_replace_all(c("One" = "1", "Two" = "2", "Three" = "3", "Four" = "4", "Five" = "5")) %>%  as.numeric()
                       )

  )


} 
)

Error in open.connection(x, "rb") : HTTP error 404.

Answer 1

出现 404 是因为页面地址缺少“.html”后缀。我们可以先构造好所有要抓取的页面 URL（带上“.html”），然后使用 map_df 逐页抓取，并把各页的数据框合并在一起。

library(tidyverse)
library(rvest)

# Catalogue pages are addressed as "page-<n>.html"; build all ten URLs up
# front so that every request carries the ".html" suffix.
url <- "http://books.toscrape.com/catalogue/page-"
pages <- paste0(url, 1:10, ".html")

# Scrape every page into a data frame (one row per book) and let map_df()
# row-bind the per-page results into a single data frame.
map_df(pages, function(i) {
  page <- read_html(i)

  title <- html_nodes(page, "h3, #title") %>%
    html_text()

  # Prices are kept as text with the currency sign removed.
  price <- html_nodes(page, ".price_color") %>%
    html_text() %>%
    gsub("£", "", .)

  # Read only the class attribute: html_attr() returns a plain character
  # vector, whereas html_attrs() returns a list of every attribute per node
  # and would depend on implicit list-to-character coercion.
  rating <- html_nodes(page, ".star-rating") %>%
    html_attr("class") %>%
    str_remove("star-rating") %>%
    str_replace_all(c(
      "One" = "1", "Two" = "2", "Three" = "3", "Four" = "4", "Five" = "5"
    )) %>%
    # as.numeric() tolerates the whitespace left over from the class string.
    as.numeric()

  data.frame(title = title, price = price, rating = rating)
})


#                                            title price rating
#1                               A Light in the ... 51.77      3
#2                               Tipping the Velvet 53.74      1
#3                                       Soumission 50.10      1
#4                                    Sharp Objects 47.82      4
#5                     Sapiens: A Brief History ... 54.23      5
#6                                  The Requiem Red 22.65      1
#7                     The Dirty Little Secrets ... 33.34      4
#8                          The Coming Woman: A ... 17.93      3
#.....

rvest 错误：Error in open.connection(x, "rb") : HTTP error 404

1 个答案: