Question

两个问题。有没有一种方法可以简化此代码，以免我每次都不重新运行对提供程序的read_html调用？另外，任何人都可以帮助我将数据组合到data.frame中。我为此感到挣扎。特别是因为它涉及不相等的那些长度不相等的列表（不过这里不是这种情况）。下面的代码：

    ######################################
#####Trip Advisor Review Scraping#####     
#####by Chris  09/11/18#####

#Packages
library(stringr) 
library(rvest)                                #########NOTE############# 
library(xml2)                                 ###CSS Selectors Change###
library(magrittr)

#Get Webpage
webpage1 <- xml2::read_html("https://www.tripadvisor.com/Restaurant_Review-g58375-d882036-Reviews-Azteca_Mexican_Restaurant-Burien_Washington.html")

url1 <- "https://www.tripadvisor.com/Restaurant_Review-g58375-d882036-Reviews-"
url2 <- c("", "or10","or20")
url3 <- "-Azteca_Mexican_Restaurant-Burien_Washington.html"
final_url <- paste0(url1,url2,url3)

#Get Max Pages

max_pages <- function(html){
  html <- read_html(html)%>%
  html_nodes("#taplc_location_reviews_list_0 > div > div:nth-child(24) > div > div > div > a.pageNum.last.taLnk")%>%
    html_text()%>%
    unname()%>%
    as.numeric()
}


#Get Ratings
  get_ratings <- function(html){
    html <- read_html(html)%>%
      html_nodes("#taplc_location_reviews_list_0 .ui_bubble_rating")%>%
      as.character()%>%
    str_extract("\\d+")%>%
    unlist()
  }

#Get Date
date <- function(html){
  html <- read_html(html)%>%
        html_nodes(".rating .ratingDate") %>%
        html_attr("title") %>%
        strptime("%b %d, %Y") %>%
        as.POSIXct()%>%
      unlist()
}

#Review
review <- function(html){
  html <- read_html(html)%>%
  html_nodes(".entry .partial_entry") %>%
  html_text()%>%
  unlist()
}

#Title of Review
title <- function(html){
  html <- read_html(html)%>%
    html_nodes(".noQuotes")%>%
    html_text()%>%
    unlist()%>%
    as.data.frame()
}

#location of reviewer
user_location <- function(html){
  html <- read_html(html)%>%
    html_nodes(".userLocation")%>%
    html_text()%>%
    unlist()%>%
    as.data.frame()
}

user_location <- lapply(final_url, user_location)
title <- lapply(final_url, title)
review <- lapply(final_url, review)
date <- lapply(final_url, date)
get_ratings <- lapply(final_url, get_ratings)

df <- as.data.frame(
  title = ifelse(length(title) == 0, NA, title),
  review = ifelse(length(review) == 0, NA, review), 
  date = ifelse(length(date) == 0, NA, date), 
  get_ratings = ifelse(length(get_ratings) == 0, NA, get_ratings),
  user_location = ifelse(length(user_location) == 0, NA, user_location),
  stringsAsFactors = F)

您可以提供的任何帮助都会有所帮助。

如何防止重新运行对提供商的呼叫

0 个答案: