如何循环调用 rvest::follow_link() 函数来抓取各链接指向的网页?
用例:
我需要的选择器如下:
library(rvest)
# Parse the IMDb title page. read_html() is used because html() is
# deprecated in rvest.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
# Overwrite the parsed page with the text of the nodes matched by the selector.
lego_movie <- lego_movie %>%
  html_nodes(".itemprop , .character a") %>%
  html_text()
# follow cast links
(".itemprop .itemprop")
# grab tables of all movies and dates for each cast member
(".year_column , b a")
期望的输出:
castMember movie year
Will Arnett Lego 2017
Will Arnett BoJack 2014
Will Arnett Wander 2014
............
Elizabeth Banks Moonbeam 2015
Elizabeth Banks Wet Hot 2015
............
Alison Brie Get Hard 2015
Alison Brie GetaJob 2015
.....etc.....
答案 0 :(得分:3)
也许可以像下面这样做。
library(rvest)
library(stringr)
library(data.table)
# Parse the title page and collect the text of the cast-name nodes.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- lego_movie %>%
  html_nodes("#titleCast .itemprop span") %>%
  html_text()
cast

# Browsing session on the title page; each cast link is followed by its text.
s <- html_session("http://www.imdb.com/title/tt1490017/")

max_titles <- 10  # maximum number of entries kept per cast member
cast_movies <- list()
# head() rather than cast[1:3]: indexing past the end would yield NA names
# when fewer than three cast entries were matched.
for (i in head(cast, 3)) {
  actorpage <- s %>%
    follow_link(i) %>%
    read_html()
  movies <- actorpage %>%
    html_nodes("b a") %>%
    html_text() %>%
    head(max_titles)
  years <- actorpage %>%
    html_nodes("#filmography .year_column") %>%
    html_text() %>%
    head(max_titles) %>%
    str_extract("[0-9]{4}")
  # The two selectors are matched independently, so their results can differ
  # in length; trim both to the common length so every element of the list
  # converts cleanly to a rectangular table.
  n_rows <- min(length(movies), length(years))
  cast_movies[[i]] <- list(
    movies = head(movies, n_rows),
    years = head(years, n_rows),
    name = rep(i, n_rows)
  )
}
cast_movies
as.data.frame(cast_movies[[1]])
# Stack the per-person lists into a single table.
rbindlist(cast_movies)
答案 1 :(得分:2)
这段代码未经测试,所以可能有错。建议逐步运行并验证结果是否正确。我不确定在这种情况下该如何使用 follow_link……但这是我想出的办法……
library("rvest")
library("stringr")
# Parse the title page once. read_html() replaces the deprecated html(), and
# the already-parsed document is not parsed a second time.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")

# Select the nodes once so that link i and text i come from the same node.
nodes <- lego_movie %>%
  html_nodes(".itemprop , a")
links <- nodes %>%
  html_attr("href")
links[is.na(links)] <- ""  # nodes without an href attribute
actors <- nodes %>%
  html_text()

df <- data.frame(name = actors, link = links, stringsAsFactors = FALSE)
# Keep only links whose path starts with "/name".
df <- subset(df, substring(link, 2, 5) == "name")
# Strip newlines and surrounding whitespace BEFORE dropping empty names, so
# that whitespace-only link texts are removed as well.
df$name <- gsub("\\n", "", df$name)
df$name <- str_trim(df$name)
df <- subset(df, name != "")
# One row per distinct name, ordered by name.
df <- df[order(df$name), ]
df <- subset(df, !duplicated(df$name))
# Scrape the date/title entries from one person's page.
#
# name: label copied into the `name` column of every returned row.
# link: site-relative path; it is appended to "http://www.imdb.com".
#
# Returns a data.frame with columns `name`, `date` and `movie`. It has zero
# rows when the selector matches nothing.
get_movies <- function(name, link) {
  url <- paste0("http://www.imdb.com", link)
  movies <- url %>%
    read_html() %>%
    html_nodes(".year_column , b a") %>%
    html_text()
  # take care of random date at top of some actors stuff...
  if (length(movies) %% 2 == 1) {
    movies <- movies[-1]
  }
  movies <- str_trim(gsub("\\n", "", movies))
  # Entries are taken as alternating date, title. A logical mask is used
  # instead of seq(1, length(movies), 2), which errors on an empty vector.
  is_date <- rep(c(TRUE, FALSE), length.out = length(movies))
  data.frame(
    name = rep(name, sum(is_date)),
    date = movies[is_date],
    movie = movies[!is_date],
    stringsAsFactors = FALSE
  )
}
# Scrape every row of df, then stack the per-person tables in one step.
# seq_len() yields an empty sequence when df has no rows (1:nrow(df) would
# iterate over c(1, 0)), and a single rbind avoids re-copying the growing
# result on every iteration.
per_person <- lapply(seq_len(nrow(df)), function(i) {
  get_movies(df$name[i], df$link[i])
})
# The leading empty data.frame keeps final_df a data.frame even when
# per_person is empty.
final_df <- do.call(rbind, c(list(data.frame()), per_person))