Text mining for specific keywords in R

Asked: 2017-11-16 09:36:05

Tags: r corpus mining

I am using a snippet of code to generate a histogram of word frequencies from a .txt file converted into a corpus, but I would like to search for specific keywords and plot their frequencies instead. The code below (after loading the required packages) reads the text into a corpus, preprocesses/cleans it, and produces a histogram of the ten most frequent words.

  library(tm)        # corpus handling and text cleaning
  library(reshape2)  # melt() for reshaping the term counts
  library(ggplot2)   # plotting

  # Read a plain-text file and turn each line into a document of a tm corpus
  createCorpus <- function(filepath) {
    conn <- file(filepath, "r")
    fulltext <- readLines(conn)
    close(conn)
    vs <- VectorSource(fulltext)
    Corpus(vs, readerControl=list(reader=readPlain, language="en"))
  }

  # build the corpus and lower-case it before any counting or matching
  news_corpus <- createCorpus("sample.txt")
  news_corpus_proc <- tm_map(news_corpus, content_transformer(tolower))

  # Count how many times a regex pattern occurs in a file, line by line
  countWords <- function(filepath, pattern) {
    conn <- file(filepath, "r")
    fulltext <- readLines(conn)
    close(conn)

    count <- 0
    for (i in seq_along(fulltext)) {
      findr <- gregexpr(pattern, fulltext[i])
      if (findr[[1]][1] > 0) {
        count <- count + length(findr[[1]])
      }
    }
    count
  }
  # the same file that was loaded into the corpus above
  news_file <- "sample.txt"
  # rough total word count: runs of whitespace approximate word boundaries
  totwords <- countWords(news_file, " * ") + 10148

  # estimate what fraction of the words are common stopwords
  mystopwords <- c(" [Aa]nd ", " [Ff]or ", " [Ii]n ", " [Ii]s ", " [Ii]t ",
                   " [Nn]ot ", " [Oo]n ", " [Tt]he ", " [Tt]o ")
  totstops <- sum(sapply(mystopwords,
                         function(x) { countWords(news_file, x) }))
  totstops/totwords
  # standard tm cleaning: drop stopwords, punctuation, numbers, extra whitespace
  news_corpus_proc <- tm_map(news_corpus_proc, removeWords,
                             stopwords(kind="en"))
  news_corpus_proc <- tm_map(news_corpus_proc, removePunctuation)
  news_corpus_proc <- tm_map(news_corpus_proc, removeNumbers)
  news_corpus_proc <- tm_map(news_corpus_proc, stripWhitespace)

  # term counts across the whole corpus, then the ten most frequent terms
  dtm <- DocumentTermMatrix(news_corpus_proc)
  dtm.matrix <- as.matrix(dtm)
  wordcount <- colSums(dtm.matrix)
  topten <- head(sort(wordcount, decreasing=TRUE), 10)
  # reshape the named vector into a data frame and order the words by count
  dfplot <- as.data.frame(melt(topten))
  dfplot$word <- dimnames(dfplot)[[1]]
  dfplot$word <- factor(dfplot$word,
                        levels=dfplot$word[order(dfplot$value,
                                                 decreasing=TRUE)])

  fig <- ggplot(dfplot, aes(x=word, y=value)) + geom_bar(stat="identity")
  fig <- fig + xlab("Word in Corpus")
  fig <- fig + ylab("Count")
  print(fig)
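
What I am trying to get to is something like the sketch below, which reuses the `wordcount` vector from the code above. The names `keywords`, `keycount`, `dfkey` and `keyfig` are placeholders of my own, and the example terms are made up; keywords that never occur in the corpus are simply dropped before plotting:

  # sketch: keep only my keywords from the term counts computed above
  # (these terms are placeholders for the keywords I actually want to track)
  keywords <- c("economy", "election", "weather")
  keycount <- wordcount[intersect(keywords, names(wordcount))]

  dfkey <- data.frame(word = names(keycount), value = as.numeric(keycount))
  dfkey$word <- factor(dfkey$word,
                       levels = dfkey$word[order(dfkey$value,
                                                 decreasing = TRUE)])

  keyfig <- ggplot(dfkey, aes(x = word, y = value)) +
    geom_bar(stat = "identity") +
    xlab("Keyword") + ylab("Count")
  print(keyfig)

Is this the right way to go about it, or is there a cleaner way to restrict the document-term matrix to a fixed list of keywords before plotting?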

0 Answers:

No answers yet