我创建了一个语料库并使用tm包处理它,下面是一个片段
cleanCorpus<-function(corpus){
corpus.tmp <- tm_map(corpus, content_transformer(tolower))
corpus.tmp <- tm_map(corpus.tmp, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
corpus.tmp <- tm_map(corpus.tmp, removeWords,stopwords("english"))
corpus.tmp <- tm_map(corpus.tmp, stemDocument)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
return(corpus.tmp)
}
myCorpus <-Corpus(VectorSource(Data$body),readerControl = list(reader=readPlain))
cln.corpus<-cleanCorpus(myCorpus)
现在我使用mpqa词典来获取语料库每个文档中的正面词和否定词的总数。
所以我把这份名单作为
pos.words <- lexicon$word[lexicon$Polarity=="positive"]
neg.words <- lexicon$word[lexicon$Polarity=="negative"]
我应该如何将每个文档的内容与正面和负面列表进行比较,并获得每个文档的计数? 我检查了tm词典上的其他帖子,但看起来该功能已被撤销。
答案 0 :(得分:1)
例如
library(tm)
data("crude")
myCorpus <- crude[1:2]
pos.words <- c("advantag", "easy", "cut")
neg.words <- c("problem", "weak", "uncertain")
weightSenti <- structure(function (m) {
m$v <- rep(1, length(m$v))
m$v[rownames(m) %in% neg.words] <- m$v[rownames(m) %in% neg.words] * -1
attr(m, "weighting") <- c("binarySenti", "binSenti")
m
}, class = c("WeightFunction", "function"), name = "binarySenti", acronym = "binSenti")
tdm <- TermDocumentMatrix(cln.corpus, control=list(weighting=weightSenti, dictionary=c(pos.words, neg.words)))
colSums(as.matrix(tdm))
# 127 144
# 2 -2