基本上，我的代码如下所示。 “ install.packages(c("ggplot2", "gridExtra", "igraph", "Matrix", "plyr", "pvclust", "RColorBrewer", "rJava", "slam", "sna", "SnowballC", "stringr", "tm", "topicmodels", "wordcloud"))
library(tm)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(plyr)
library(stringr)
library(gridExtra)
library(Matrix)
# Load the review data from a CSV file.
# `dir` is the folder that holds the input file; because of setwd() it is also
# where the relative output paths used later are written. Adjust it for your
# machine, e.g. dir <- "C:\\temp" or dir <- "C:\\RApps".
dir <- "C:\\RApps"
setwd(dir)
file <- "Yelp3.csv"
# Build the path with file.path(). The previous `sep = "\"` never closed its
# string literal (the backslash escapes the closing quote), so the read.csv()
# call could not be parsed and `df` was never created. Every later `df$text`
# then resolved to the built-in function stats::df(), which is what produces
# "object of type 'closure' is not subsettable".
df <- read.csv(file.path(dir, file), stringsAsFactors = FALSE)
# Fail early with a clear message if the expected column is missing.
if (!"text" %in% names(df)) {
  stop("Column 'text' not found in ", file, call. = FALSE)
}
# Re-encode the text to UTF-8 (iconv() is vectorised, no per-row loop needed)
# and lower-case it so that lexicon matching further down is case-insensitive.
df$text <- tolower(iconv(df$text, to = "UTF-8"))
# Switch for the lexicon-based sentiment analysis section below.
RunBreenSentimentAnalysis <- TRUE
if (RunBreenSentimentAnalysis) {
# Read the two sentiment word lists; anything after ';' on a line is treated
# as a comment by scan() and skipped.
hu.liu.pos <- scan(
  "C:/RApps/positive-words.txt",
  what = "character",
  comment.char = ";"
)
hu.liu.neg <- scan(
  "C:/RApps/negative-words.txt",
  what = "character",
  comment.char = ";"
)
# Working lexicons. Extra terms can be appended here, e.g.
# c(hu.liu.pos, "newterm", "newterm2").
pos.words <- c(hu.liu.pos)
neg.words <- c(hu.liu.neg)
# Score each sentence as (number of positive-lexicon tokens) minus
# (number of negative-lexicon tokens).
#
# Args:
#   sentences: character vector of texts, one element per document.
#   pos.words: character vector of positive terms.
#   neg.words: character vector of negative terms.
#   .progress: progress-bar name, passed through unchanged to plyr::laply().
#
# Returns:
#   A data.frame with one row per input sentence and two columns:
#   `score` (the sentiment score) and `text` (the input sentence, kept as
#   character rather than factor).
#
# Matching is exact and case-sensitive; lower-case the input beforehand if
# the lexicons are lower-case.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  score_one <- function(sentence) {
    # Split on runs of whitespace; str_split() (stringr) returns a list, so
    # flatten it to a plain character vector of tokens.
    tokens <- unlist(str_split(sentence, "\\s+"))
    # Tokens such as "great!" or "bad," would never equal a lexicon entry, so
    # also test each token with leading/trailing punctuation removed. The raw
    # token is still tested, so entries that themselves contain punctuation
    # keep matching exactly as before.
    bare <- gsub("^[[:punct:]]+|[[:punct:]]+$", "", tokens)
    is_pos <- tokens %in% pos.words | bare %in% pos.words
    is_neg <- tokens %in% neg.words | bare %in% neg.words
    # sum() treats TRUE/FALSE as 1/0, giving the hit counts directly.
    sum(is_pos) - sum(is_neg)
  }
  # laply(): list/vector in, simple array of scores out.
  scores <- laply(sentences, score_one, .progress = .progress)
  # stringsAsFactors = FALSE keeps `text` as character on R < 4.0 as well.
  data.frame(score = scores, text = sentences, stringsAsFactors = FALSE)
}
# Score every review and show a quick base-graphics histogram of the scores.
scores <- score.sentiment(df$text, pos.words, neg.words, .progress = "text")
hist(scores$score)
# Build the publication-style histogram. The plot is stored and printed
# explicitly: inside an `if { }` block a bare ggplot() expression is not
# auto-printed, so it would otherwise never be displayed.
score_hist <- ggplot(scores, aes(x = score)) +
  geom_histogram(binwidth = 1) +
  xlab("Sentiment score") +
  ylab("Frequency") +
  theme_bw() +
  theme(
    axis.title.x = element_text(vjust = -0.5, size = 14),
    axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
    plot.margin = unit(c(1, 1, 2, 2), "cm")
  )
print(score_hist)
# Export the plot to a PDF file; pass the plot explicitly instead of relying
# on last_plot(), and spell out `filename` rather than partially matching it.
ggsave(filename = "OverallSentimentHistogram1.pdf", plot = score_hist)
# Extract documents by sentiment category. Note that "very positive" and
# "very negative" are subsets of "positive" and "negative" respectively.
scores.neutral <- subset(scores, scores$score == 0) # score exactly 0
scores.pos <- subset(scores, scores$score >= 1) # score of 1 or more
scores.neg <- subset(scores, scores$score <= -1) # score of -1 or less
scores.verypos <- subset(scores, scores$score >= 2) # score of 2 or more
scores.veryneg <- subset(scores, scores$score <= -2) # score of -2 or less
# Export the full table and each category to CSV in the working directory.
write.csv(scores, file = "SentimentScores1.csv")
write.csv(scores.neutral, file = "SentimentScores_Neutral1.csv")
write.csv(scores.pos, file = "SentimentScores_Positive1.csv")
write.csv(scores.neg, file = "SentimentScores_Negative1.csv")
write.csv(scores.verypos, file = "SentimentScores_VeryPositive1.csv")
write.csv(scores.veryneg, file = "SentimentScores_VeryNegative1.csv")
} "
我得到的错误如下：1. -> 读取了 2006 项；读取了 4784 项；df$text 中的错误：'closure' 类型的对象不可取子集（object of type 'closure' is not subsettable） >
首先删除奇怪的字符，
以清理 Twitter 消息：df$text <- sapply(df$text, function(row) iconv(row, to = 'UTF-8')) df$text 中的错误：'closure' 类型的对象不可取子集；df$text <- tolower(df$text) df$text 中的错误：'closure' 类型的对象不可取子集