I'm using a code snippet to generate a histogram of word frequencies from a .txt file that I load into a corpus, but I also want to search for specific keywords and plot their frequencies. The code below (run after loading the required packages) reads the text into a corpus, preprocesses/cleans it, and produces a histogram of the 10 most frequent words; a rough sketch of the keyword-specific plot I'm after follows the code.
createCorpus <- function(filepath) {
  conn <- file(filepath, "r")
  fulltext <- readLines(conn)
  close(conn)
  # Each line of the file becomes one document in the corpus
  vs <- VectorSource(fulltext)
  Corpus(vs, readerControl = list(reader = readPlain, language = "en"))
}
news_corpus <- createCorpus("sample.txt")
news_corpus_proc <- tm_map(news_corpus, content_transformer(tolower))
countWords <- function(filepath, pattern) {
  conn <- file(filepath, "r")
  fulltext <- readLines(conn)
  close(conn)
  count <- 0
  for (i in seq_along(fulltext)) {
    # gregexpr() returns -1 when the pattern does not match the line
    findr <- gregexpr(pattern, fulltext[i])
    if (findr[[1]][1] > 0) {
      count <- count + length(findr[[1]])
    }
  }
  count
}
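# Example (assuming the same sample.txt loaded above): this would count
# space-delimited occurrences of "the"/"The" across all lines:
# countWords("sample.txt", " [Tt]he ")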
news_file <- "sample.txt"   # same file that was loaded into the corpus above
totwords <- countWords(news_file, " * ") + 10148   # rough total word count from runs of spaces
mystopwords <- c(" [Aa]nd ", " [Ff]or ", " [Ii]n ", " [Ii]s ", " [Ii]t ",
                 " [Nn]ot ", " [Oo]n ", " [Tt]he ", " [Tt]o ")
totstops <- sum(sapply(mystopwords,
                       function(x) countWords(news_file, x)))
totstops / totwords   # share of the total word count taken up by these stop words
news_corpus_proc <- tm_map(news_corpus_proc, removeWords,
stopwords(kind="en"))
news_corpus_proc <- tm_map(news_corpus_proc, removePunctuation)
news_corpus_proc <- tm_map(news_corpus_proc, removeNumbers)
news_corpus_proc <- tm_map(news_corpus_proc, stripWhitespace)
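# Optional sanity check: tm's inspect() prints the cleaned documents,
# e.g. inspect(news_corpus_proc[1:3])  (the 1:3 index is just an example)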
dtm <- DocumentTermMatrix(news_corpus_proc)
dtm.matrix <- as.matrix(dtm)
wordcount <- colSums(dtm.matrix)
topten <- head(sort(wordcount, decreasing=TRUE), 10)
dfplot <- as.data.frame(melt(topten))   # named vector -> one-column data frame ("value")
dfplot$word <- rownames(dfplot)         # the term names become the row names
dfplot$word <- factor(dfplot$word,
                      levels = dfplot$word[order(dfplot$value, decreasing = TRUE)])
fig <- ggplot(dfplot, aes(x=word, y=value)) + geom_bar(stat="identity")
fig <- fig + xlab("Word in Corpus")
fig <- fig + ylab("Count")
print(fig)
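To plot only chosen keywords instead of the overall top ten, one way (a minimal sketch, assuming the wordcount vector built above and an illustrative keywords vector that is not part of the original script) is to subset the term counts for those keywords and reuse the same plotting steps:

# Sketch only: "keywords" is an illustrative list, not from the original script.
# Keywords must match the cleaned corpus (lower case, no punctuation, no numbers).
keywords <- c("economy", "school", "weather")

# Keep only the keywords that actually occur in the document-term matrix,
# then pull their total counts out of the wordcount vector built above.
found <- intersect(keywords, names(wordcount))
keycount <- wordcount[found]

dfkey <- data.frame(word = names(keycount), value = as.numeric(keycount))
dfkey$word <- factor(dfkey$word,
                     levels = dfkey$word[order(dfkey$value, decreasing = TRUE)])

figkey <- ggplot(dfkey, aes(x = word, y = value)) + geom_bar(stat = "identity")
figkey <- figkey + xlab("Keyword in Corpus")
figkey <- figkey + ylab("Count")
print(figkey)

If counts from the raw text (before stop-word/punctuation removal) are wanted instead, the same data frame could be filled by calling countWords() on the file with patterns like " [Kk]eyword ".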