正如 @chinsoon12 在评论中提到的,抓取TripAdvisor上的信息违反了其使用条款。但我仍想知道在这个例子中如何使用rvest中的POST方法。我搜索了谷歌和stackoverflow,但找到的答案帮助不大。任何一般性的建议我也很感激!
所以,我需要点击“更多”按钮查看完整评论。否则它只给出部分评论。
我已成功使用RSelenium模拟点击并获取了完整评论,但我想知道如何用rvest和httr实现。
在观察网络流量后,我发现点击“更多”按钮后,我发送了两个下面列出的POST请求:
我尝试了下面的代码,但响应的正文是空的。
library(rvest)
library(httr)
# Attempt to fetch the expanded ("More") review text by replaying the AJAX POST
# that the browser sends, reusing the user agent of an rvest session.
# NOTE(review): the server may still reject or ignore a replayed request;
# this only makes the request itself well-formed.

# Hotel page whose reviews are truncated behind the "More" button.
url <- "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews-Hotel_Bristol-Steamboat_Springs_Colorado.html"
# Endpoint the POST is sent to.
post_to_url <- "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer="

# Table of user-agent strings; one is drawn at random for the session.
user_agent_table <- read.csv(
  "https://raw.githubusercontent.com/yusuzech/top-50-user-agents/master/user_agent.csv",
  stringsAsFactors = FALSE
)
# Already URL-encoded form payload (review ids plus page bookkeeping fields).
post_body <- "reviews=556957481%2C511497076%2C556144452%2C554686822%2C548218482&contextChoice=DETAIL_HR&haveJses=earlyRequireDefine%2Camdearly%2Cglobal_error%2Clong_lived_global%2Capg-Hotel_Review%2Capg-Hotel_Review-in%2Cbootstrap%2Cdesktop-rooms-guests-dust-en_US%2Cresponsive-calendar-templates-dust-en_US%2Ctaevents&haveCsses=apg-Hotel_Review-in&Action=install"
user_agent_list <- user_agent_table$User.agent

# Returns an httr user-agent config built from a randomly sampled string.
random_agent <- function() {
  user_agent(sample(user_agent_list, 1))
}

# Open the hotel page first so the POST can reuse its URL and user agent.
mysession <- html_session(url, random_agent())

result <- POST(
  url = post_to_url,
  # Send the Referer as a real HTTP header via add_headers() instead of a
  # bare list passed as `config`.
  add_headers(Referer = mysession$url),
  user_agent(mysession$config$options$useragent),
  # State the form encoding explicitly rather than relying on a default.
  content_type("application/x-www-form-urlencoded"),
  body = post_body,
  # The body is already encoded, so send it verbatim.
  encode = "raw"
)
result
> result
Response [https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=]
Date: 2018-05-10 01:49
Status: 200
Content-Type: text/html;charset=UTF-8
<EMPTY BODY>
我知道我需要使用POST方法,但我不知道如何设置body和其他配置。我也不确定是否必须同时发送这两个POST请求,以及如何在httr和rvest中实现这一点。
感谢任何帮助!
答案 0 :(得分:1)
我也尝试像您一样用rvest:::request_POST发送POST请求,但失败了,收到了“不允许使用方法(HTTP 405)”的错误消息。
不过事实证明,我们根本不必用rvest:::request_POST发送这样的请求:完整的评论已经包含在页面源代码中,只是默认没有显示出来。
以下是抓取该酒店所有评论的代码:
library(rvest)
library(stringr)
# Scrape all reviews of one hotel into `reviews_df`, one row per review.
# The full review text is read straight from each page's HTML, so no POST
# request is needed. Results: `pages_url` (character vector of page URLs) and
# `reviews_df` (data frame with columns reviewers, review_dates, stars,
# contributions, helpful_votes, review_titles, reviews).

# Number of review pages and the step between page offsets; both hard-coded
# for this hotel.
n_pages <- 42L
reviews_per_page <- 5L

# Page 1 has no offset tag; page k (k > 1) carries "or<5 * (k - 1)>-".
url_prefix <- "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews-"
url_suffix <- "The_Bristol_by_Magnuson_Worldwide-Steamboat_Springs_Colorado.html#REVIEWS"
page_offsets <- reviews_per_page * (seq_len(n_pages) - 1L)
offset_tags <- ifelse(page_offsets == 0L, "", paste0("or", page_offsets, "-"))
pages_url <- paste0(url_prefix, offset_tags, url_suffix)

# Extract one reviewer statistic (e.g. "contribution", "helpful vote") per
# review card. Returns an integer vector aligned with `cards`; a card that
# lacks the statistic yields 0.
member_stat <- function(cards, label) {
  stat_xpath <- paste0(
    "./descendant::span[contains(@class, 'social-member-MemberHeaderStats__stat_item') and contains(., '",
    label,
    "')]/descendant::span[contains(@class, 'social-member-MemberHeaderStats__bold')]"
  )
  # html_node() gives exactly one result per card (NA when absent), which
  # keeps the output aligned with the cards.
  raw <- cards %>%
    html_node(xpath = stat_xpath) %>%
    html_text()
  # Drop thousands separators and other non-digits before converting.
  counts <- as.integer(gsub("[^0-9]", "", raw))
  counts[is.na(raw)] <- 0L
  counts
}

# Download one review page and return its reviews as a data frame.
# A page without review cards yields a zero-row data frame.
scrape_review_page <- function(page_url) {
  page <- read_html(page_url)
  cards <- page %>%
    html_nodes(xpath = "//div[@data-test-target='reviews-tab']/div[@data-test-target='HR_CC_CARD']")

  member_xpath <- "./descendant::a[contains(@class, 'social-member-event-MemberEventOnObjectBlock__member')]"

  reviewers <- cards %>%
    html_node(xpath = member_xpath) %>%
    html_text(trim = TRUE)

  # The text node right after the member link holds the date.
  review_dates <- cards %>%
    html_node(xpath = paste0(member_xpath, "/following-sibling::text()[1]")) %>%
    html_text(trim = TRUE)
  # Keep everything from character 16 on.
  # NOTE(review): presumably this skips a fixed "wrote a review " prefix; confirm.
  review_dates <- str_sub(review_dates, start = 16)

  # The rating is encoded in a class name: bubble_10 .. bubble_50 -> 1 .. 5.
  # Cards without a recognisable rating class get NA.
  star_classes <- cards %>%
    html_node(xpath = "./descendant::div[@data-test-target='review-rating']/span[contains(@class, 'ui_bubble_rating')]") %>%
    html_attr("class")
  stars <- as.integer(str_match(star_classes, "bubble_([1-5])0")[, 2])

  contributions <- member_stat(cards, "contribution")
  helpful_votes <- member_stat(cards, "helpful vote")

  review_titles <- cards %>%
    html_node(xpath = "./descendant::div[@data-test-target='review-title']") %>%
    html_text(trim = TRUE)

  reviews <- cards %>%
    html_node(xpath = "./descendant::q[contains(@class, 'location-review-review-list-parts-ExpandableReview__reviewText')]") %>%
    html_text(trim = TRUE)

  data.frame(
    reviewers, review_dates, stars, contributions, helpful_votes,
    review_titles, reviews,
    stringsAsFactors = FALSE
  )
}

# Scrape every page, then bind once instead of growing the result in a loop.
page_tables <- lapply(pages_url, scrape_review_page)
reviews_df <- do.call(rbind, page_tables)