I want to use the scrape_change_page function below to add the links to the individual petitions to the data frame, but I'm not sure how to adapt it to include the html_attr function that scrapes the URL links from multiple pages. Any ideas are very welcome!
library(pacman)
pacman::p_load(rvest, dplyr, stringr, purrr, lubridate, tibble, tidyr, stringi)
url <- 'https://www.change.org/search?q=ppe'
scrape_change_page <- function(url) {
  webpage <- xml2::read_html(url)

  # Extract the text of every node matching a CSS selector,
  # padding with "" so each page yields exactly 10 rows
  get_text <- function(css) {
    vec <- rvest::html_text(rvest::html_nodes(webpage, css), trim = TRUE)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  dplyr::tibble(
    title      = get_text('.xs-mbs'),
    date       = gsub("Created", "", get_text('.symbol-clock+ span')),
    supporters = gsub(" supporters", "", get_text('.symbol-supporters+ span')),
    addressee  = gsub("Petition to ", "", get_text('.xs-mbn .type-s')),
    location   = get_text('.plxxs')
  )
}
scrape_change_page(url)
# Select the number of pages (test on 3 pages)
n_pages <- 3
# Offsets 0, 10, 20: one page of 10 results each
urls <- paste0(url, "&offset=", 10 * (seq(n_pages) - 1))
result <- do.call(rbind, lapply(urls, scrape_change_page))
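Since purrr is already loaded, an equivalent way to write the row-binding step is map_dfr, which returns a tibble directly:

# Same result as do.call(rbind, lapply(urls, scrape_change_page))
result <- purrr::map_dfr(urls, scrape_change_page)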
Here is the basic html_attr call that scrapes the links:
page <- read_html("https://www.change.org/search?q=ppe")
page %>%
html_nodes(".search-results .list-rule") %>%
html_nodes("a") %>%
html_attr("href")
Answer 0 (score: 0)
Solved by updating the function. Updated code below.
scrape_change_page <- function(url) {
  webpage <- xml2::read_html(url)

  # Extract the text of every node matching a CSS selector,
  # padding with "" so each page yields exactly 10 rows
  get_text <- function(css) {
    vec <- rvest::html_text(rvest::html_nodes(webpage, css), trim = TRUE)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  # Extract an attribute (e.g. href) of every matching node, padded the same way
  get_attr <- function(css, attr) {
    vec <- rvest::html_attr(rvest::html_nodes(webpage, css), attr)
    if (length(vec) < 10) c(vec, rep("", 10 - length(vec))) else vec
  }

  dplyr::tibble(
    title      = get_text('.xs-mbs'),
    date       = gsub("Created", "", get_text('.symbol-clock+ span')),
    supporters = gsub(" supporters", "", get_text('.symbol-supporters+ span')),
    addressee  = gsub("Petition to ", "", get_text('.xs-mbn .type-s')),
    location   = get_text('.plxxs'),
    link       = get_attr('.search-results .list-rule a.link-block.js-click-search-result', 'href')
  )
}
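For completeness, the paginated scrape from the question works unchanged with the updated function. A minimal usage sketch, with url and n_pages as above; resolving the relative hrefs via xml2::url_absolute is an optional extra step:

url <- 'https://www.change.org/search?q=ppe'
n_pages <- 3
urls <- paste0(url, "&offset=", 10 * (seq(n_pages) - 1))
result <- do.call(rbind, lapply(urls, scrape_change_page))

# Optional: turn relative hrefs into full URLs
# (padded "" rows will resolve to the bare base URL)
result$link <- xml2::url_absolute(result$link, "https://www.change.org")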