我正在尝试将 RSentiment::calculate_score() 应用于存储在 data.frame 中的一组句子。以下是我获取数据的方式:
# Install pacman only when it is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() attaches each listed package, installing it first if necessary.
# RSentiment is included because calculate_score() is called later on.
pacman::p_load(
  XML, dplyr, tidyr, stringr, rvest, audio, xml2, purrr, tidytext, ggplot2,
  RSentiment
)
# Amazon product codes used to build the product and review URLs.
sapiens_code <- "1846558239"
deus_ex_code <- "1910701874"
# Fetch the product title shown on an Amazon UK product page.
#   prod_code: product code appended to the "/dp/" product URL.
# Returns the text of the "#productTitle" node(s) with newlines removed and
# leading/trailing whitespace trimmed.
function_product <- function(prod_code) {
  product_page <- xml2::read_html(
    paste0("https://www.amazon.co.uk/dp/", prod_code)
  )
  title_text <- html_text(html_nodes(product_page, "#productTitle"))
  # Drop embedded newlines first, then trim whitespace at both ends.
  title_text <- gsub("\n", "", title_text)
  gsub("^\\s+|\\s+$", "", title_text)
}
# Look up the product title for each book from its product code.
sapiens <- function_product(sapiens_code)
deus_ex <- function_product(deus_ex_code)
# Source the helper script that parses Amazon HTML pages for review data.
# NOTE(review): source() on a URL runs remote code as-is; presumably this is
# where amazon_scraper() is defined -- confirm before relying on it.
source("https://raw.githubusercontent.com/rjsaito/Just-R-Things/master/Text%20Mining/amazonscraper.R")
# Extracting reviews: number of review pages requested per product.
pages <- 13
# Fetch one page of reviews for a product.
#   page_num:  review page number placed in the `pageNumber` query parameter.
#   prod_code: Amazon product code inserted into the review URL.
# Returns whatever amazon_scraper() produces for the downloaded page.
function_page <- function(page_num, prod_code) {
  # Use https, matching the scheme function_product() uses for the same site.
  url2 <- paste0(
    "https://www.amazon.co.uk/product-reviews/", prod_code,
    "/?pageNumber=", page_num
  )
  doc2 <- read_html(url2)
  # amazon_scraper() is expected to come from the sourced amazonscraper.R
  # script. FALSE is spelled out because `F` is an ordinary variable that can
  # be reassigned.
  amazon_scraper(doc2, reviewer = FALSE, delay = 2)
}
# Download every review page for each book and stack the per-page results.
# seq_len() is used instead of 1:pages so that pages = 0 produces no requests
# rather than iterating over c(1, 0).
sapiens_reviews <- map2(seq_len(pages), sapiens_code, function_page) %>%
  bind_rows()
deusex_reviews <- map2(seq_len(pages), deus_ex_code, function_page) %>%
  bind_rows()
# Insert a space after every full stop in the review text (presumably so that
# sentences written without a space after the period can be split later).
sapiens_reviews$comments <- gsub("\\.", "\\. ", sapiens_reviews$comments)
deusex_reviews$comments <- gsub("\\.", "\\. ", deusex_reviews$comments)
# Split review comments into sentences.
#   df: data frame with `comments`, `format`, `stars` and `helpful` columns.
# Returns the selected columns with `comments` tokenised into a `sentence`
# column via unnest_tokens(token = "sentences").
sentence_function <- function(df) {
  review_cols <- select(df, comments, format, stars, helpful)
  unnest_tokens(review_cols, sentence, comments, token = "sentences")
}
# Split the reviews of each book into sentence-level rows.
sapiens_sentence <- sentence_function(sapiens_reviews)
deusex_sentence <- sentence_function(deusex_reviews)
但是当我尝试为这些句子计算得分时,收到了一个错误:
# Score every sentence. Inside mutate() a new column must be named with `=`;
# with `<-` the column would be named after the whole expression instead of
# `sentence_score`.
deusex_sentence <- deusex_sentence %>%
  mutate(sentence_score = unname(calculate_score(sentence)))
错误:参数意味着行数不同:34,33
我看不出输入格式有任何根本性的问题,而且对随机选取的单个句子,输出看起来也是正常的,例如:
# Score a single value: row 1, column 4 of sapiens_sentence
# (presumably the `sentence` column -- confirm the column order).
unname(calculate_score(sapiens_sentence[1, 4]))
[1] -1
有什么办法可以解决这个问题吗?非常感谢你的帮助!
答案 0 :(得分:1)
事实证明,问题是由句子中的特殊字符引起的。删除后,我可以成功运行情绪分析(我在函数中包含了数据清理步骤):
# Split review comments into sentences, clean them, and score each one.
#   df: data frame with `comments`, `format`, `stars` and `helpful` columns.
# Returns the sentence-level table with two extra columns:
#   sentence2      - `sentence` with every non-alphanumeric character
#                    replaced by a space
#   sentence_score - unname(calculate_score(sentence2))
sentence_function <- function(df) {
  review_cols <- select(df, comments, format, stars, helpful)
  sentences <- unnest_tokens(
    review_cols, sentence, comments,
    token = "sentences"
  )
  # mutate() evaluates its arguments in order, so sentence_score can use the
  # sentence2 column created just before it.
  mutate(
    sentences,
    sentence2 = str_replace_all(sentence, "[^[:alnum:]]", " "),
    sentence_score = unname(calculate_score(sentence2))
  )
}
# Rebuild both sentence tables with sentence_function().
# Go and get a hot drink while this is running.
sapiens_sentence <- sentence_function(sapiens_reviews)
deusex_sentence <- sentence_function(deusex_reviews)