我有两个问题。第一,有没有办法简化这段代码,使我不必在每个函数里都对同一个网址重新调用 read_html?第二,有人能帮我把这些数据合并成一个 data.frame 吗?这一步我一直没能做好,尤其是当各个列表长度不相等时(不过这里并不存在这种情况)。代码如下:
######################################
#####Trip Advisor Review Scraping#####
#####by Chris 09/11/18#####
#Packages
library(stringr)
library(rvest) #########NOTE#############
library(xml2) ###CSS Selectors Change###
library(magrittr)
# Page URLs ----
# Every review page shares a prefix and a suffix; only the middle offset
# segment differs ("" for the first page, then "or10", "or20").
url1 <- "https://www.tripadvisor.com/Restaurant_Review-g58375-d882036-Reviews-"
url3 <- "-Azteca_Mexican_Restaurant-Burien_Washington.html"
url2 <- c("", "or10", "or20")

# One complete URL per offset (url1 and url3 are recycled across url2).
final_url <- sprintf("%s%s%s", url1, url2, url3)

# Parsed copy of the first review page.
webpage1 <- xml2::read_html("https://www.tripadvisor.com/Restaurant_Review-g58375-d882036-Reviews-Azteca_Mexican_Restaurant-Burien_Washington.html")
# Get Max Pages
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: the text of the matched "last page" link(s) converted with
#   as.numeric(); numeric(0) when the selector matches nothing.
max_pages <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  # NOTE(review): the nth-child(24) step ties the pager to a fixed position in
  # the review list -- confirm it still matches when the page layout changes.
  last_page_link <- html_nodes(
    page,
    "#taplc_location_reviews_list_0 > div > div:nth-child(24) > div > div > div > a.pageNum.last.taLnk"
  )

  # Returned as a plain expression (not an assignment) so the value is visible.
  as.numeric(html_text(last_page_link))
}
# Get Ratings
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: a character vector with one element per rating node, holding the
#   first run of digits found in that node's markup (NA when it has none).
get_ratings <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  rating_nodes <- html_nodes(
    page,
    "#taplc_location_reviews_list_0 .ui_bubble_rating"
  )

  # The score is taken from the serialised node rather than its text.
  # NOTE(review): presumably the digits come from the node's class attribute --
  # confirm against the live markup.
  str_extract(as.character(rating_nodes), "\\d+")
}
# Get Date
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: a POSIXct vector, one element per ".rating .ratingDate" node, parsed
#   from the node's title attribute; NA where the attribute is missing or does
#   not match the format.
# Note: this definition masks base::date() for code that runs after it.
date <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  date_labels <- html_attr(html_nodes(page, ".rating .ratingDate"), "title")

  # %b matches month names of the current locale, so these labels only parse
  # when LC_TIME provides the month names used on the page.
  as.POSIXct(strptime(date_labels, "%b %d, %Y"))
}
# Review
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: a character vector with the text of every ".entry .partial_entry"
#   node; character(0) when nothing matches.
review <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  # Returned as a plain expression (not an assignment) so the value is visible.
  html_text(html_nodes(page, ".entry .partial_entry"))
}
# Title of Review
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: a one-column data frame with one row per ".noQuotes" node.
# Note: this definition masks graphics::title() for code that runs after it.
title <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  # The final step stays in the pipe on purpose: as.data.frame() names the
  # column after the piped placeholder ("."), which is what callers already get.
  page %>%
    html_nodes(".noQuotes") %>%
    html_text() %>%
    as.data.frame()
}
# Location of reviewer
#
# html: anything read_html() understands (URL, file path, literal HTML), or an
#   already parsed xml2 document/node. A parsed page is used as is, so one
#   download can be shared between several extractors.
# Returns: a one-column data frame with one row per ".userLocation" node.
user_location <- function(html) {
  # Only parse when we were not handed a parsed page already.
  page <- if (inherits(html, c("xml_document", "xml_node"))) {
    html
  } else {
    read_html(html)
  }

  # The final step stays in the pipe on purpose: as.data.frame() names the
  # column after the piped placeholder ("."), which is what callers already get.
  page %>%
    html_nodes(".userLocation") %>%
    html_text() %>%
    as.data.frame()
}
# Assemble the reviews of all pages into one data frame ----
# Results are kept in their own objects so the extractor functions are not
# overwritten and the script can be re-run. Each page is downloaded once, and
# the table is built per page with as.data.frame() on a named list of columns.

# Bring a vector to length `n`: indexing past the end yields NA and keeps the
# vector's class (e.g. POSIXct).
pad_to <- function(x, n) {
  x[seq_len(n)]
}

# Scrape every field of one review page into a data frame.
scrape_page <- function(page_url) {
  # Download once, then hand the extractors the literal HTML: read_html()
  # accepts a string of markup, so none of them goes back to the network.
  page_html <- as.character(read_html(page_url))

  fields <- list(
    # title() and user_location() return one-column data frames; take the
    # column (as.character() also covers factor columns from older R).
    title = as.character(title(page_html)[[1]]),
    review = review(page_html),
    date = date(page_html),
    get_ratings = get_ratings(page_html),
    user_location = as.character(user_location(page_html)[[1]])
  )

  # Fields can differ in length (e.g. a reviewer without a location), so pad
  # the shorter ones with NA.
  # NOTE(review): padding appends NA at the end, so a field missing in the
  # middle of a page shifts the later values up -- confirm alignment if that
  # can happen.
  n_rows <- max(lengths(fields))
  as.data.frame(lapply(fields, pad_to, n = n_rows), stringsAsFactors = FALSE)
}

page_tables <- lapply(final_url, scrape_page)
df <- do.call(rbind, page_tables)
非常感谢您提供的任何帮助。