我其实知道该怎么做,但想知道能否用最少的代码来实现:当评论没有全部显示在同一页上时,如何把所有页面的评论都抓取下来?
网站地址是 https://www.sikayetvar.com/turkcell ,内容是客户对 Turkcell GSM 运营商的投诉。我的代码如下所示:
library(rvest); library(tidyverse)
# Landing page that lists the complaints.
url <- "https://www.sikayetvar.com/turkcell"

# Read the complaint titles from the listing page and turn each title into
# the URL of its detail page: replace non-ASCII (e.g. Turkish) characters,
# lower-case, drop punctuation, swap spaces for hyphens, then prefix the
# listing URL.
reviews <- url %>%
  read_html() %>%
  html_nodes("a.complaint-link-for-ads") %>%
  html_text() %>%
  textclean::replace_non_ascii() %>%
  tolower() %>%
  gsub("[[:punct:]]", "", .) %>%
  gsub(" ", "-", .) %>%
  stringr::str_c(url, "/", .)
#' Fetch the complaint text from a single complaint page
#'
#' @param master_df Location of one complaint page, handed straight to
#'   `read_html()`. Despite the name, the script calls this with one URL
#'   string at a time, not with a data frame.
#' @return A data frame with one character column, `reviews`, containing the
#'   text of every `div.description` node on the page. It has zero rows when
#'   nothing matches or when the page cannot be read; in the latter case a
#'   warning naming the page is raised.
get_reviews <- function(master_df) {
  description_text <- tryCatch(
    read_html(master_df) %>%
      html_nodes("div.description") %>%
      html_text(),
    # One unreachable page should not abort a scrape over many pages:
    # report it and contribute no rows instead.
    error = function(e) {
      warning(
        "Skipping ", master_df, ": ", conditionMessage(e),
        call. = FALSE
      )
      character(0)
    }
  )

  # Name the column explicitly; as.data.frame() on a bare expression would
  # otherwise use the deparsed expression text as the column name.
  data.frame(reviews = description_text, stringsAsFactors = FALSE)
}
# Visit every complaint URL and stack the per-page results into one tibble.
reviews_full <- reviews %>%
  map(get_reviews) %>%
  bind_rows() %>%
  as_tibble()

# Each page contributes a single text column; give it a stable name.
colnames(reviews_full) <- "reviews"

# Remove runs of newlines in place. Passing the cleaned character vector back
# through as_tibble() would rename the column to `value` and discard the
# name set above.
reviews_full$reviews <- gsub("\\n+", "", reviews_full$reviews)

# Preview at most the first two complaints; head() copes with fewer rows.
head(reviews_full, 2)
谢谢。