Text analysis program that used to work, now it doesn't

Asked: 2017-06-06 16:47:58

Tags: text rweka

I wrote the script shown below (pieced together from the infamous Coursera course, among other sources), and it has served me well. Not sure whether something has changed, but it no longer seems to work, and I haven't changed anything.

The first thing that no longer seems to work is the for loop that removes special characters.

Next, the word cloud doesn't seem to work once I treat the corpus as plain text documents.

Finally, the tokenizer functions all produce the same chart: essentially the most frequently used single words rather than the n-grams as programmed. That is, every n-gram produces the identical chart of the most common single words instead of the 2-, 3-, or 4-word n-grams, and so on...

Not sure whether a package update or an R update caused this.

Any ideas?

#Set the working directory
cname <- file.path("c:/texts")
cname   
dir(cname) 
setwd("c:/texts")

library("RColorBrewer")
library("tm")
library("knitr")
library("devtools")
library("plyr")
library("ggplot2")
library("wordcloud")
library("rJava")
library("RWeka")
library("stringi")
library("XLConnect")
library("XLConnectJars")

df<- readWorksheetFromFile("uars.xlsx", sheet=1, startRow=1)
df1 <- df[df$Business %in% "FRAUD", ]

#Load the R package for text mining and then load your texts into R.
library(tm)  

docs <- Corpus(VectorSource(df1))   

summary(docs)   

#read your documents in the R terminal using 
inspect(docs) 

#Preprocessing

#Removing punctuation
docs <- tm_map(docs, removePunctuation) 

# remove special characters.
for(j in seq(docs))   
{   
  docs[[j]] <- gsub("/", " ", docs[[j]])   
  docs[[j]] <- gsub("@", " ", docs[[j]])   
  docs[[j]] <- gsub("\\|", " ", docs[[j]])   
}   
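
(An aside, in case newer tm is the culprit: assigning a plain character string back into docs[[j]] strips the document class, which would break exactly this loop. A minimal sketch of the content_transformer() idiom that avoids the problem, assuming that is indeed what changed:)

#Reusable transformer that replaces a regex pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")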

#Removing numbers:
docs <- tm_map(docs, removeNumbers)  

#Converting to lowercase:
docs <- tm_map(docs, tolower)  
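
(Similarly, with tm 0.6 and later the usual advice is to wrap base functions such as tolower in content_transformer(), since mapping them directly can corrupt the corpus. A sketch of the safer form, though I can't be sure this is the failure here:)

docs <- tm_map(docs, content_transformer(tolower))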

#Removing "stopwords" (common words) that usually have no analytic value
docs <- tm_map(docs, removeWords, c(stopwords("english"), "bank", "account", "customer", "transactions", "sent", "received", "company", 
                                    "wire", "wires", "payment", "payments", "wells", "fargo", "transaction", "fraud", "wholesale", "wholesal", "uar", "email"))
#Removing common word endings (e.g., "ing", "es", "s")
library(SnowballC)   
docs <- tm_map(docs, stemDocument) 

#Stripping unnecessary whitespace from your documents:
docs <- tm_map(docs, stripWhitespace)   

#treat your preprocessed documents as text documents.
docs <- tm_map(docs, PlainTextDocument)   

#Stage the Data

#To proceed, create a document term matrix
dtm <- DocumentTermMatrix(docs)   
dtm
inspect(dtm)
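
(A quick sanity check on the matrix: findFreqTerms() from tm lists terms above a frequency floor; the cutoff of 10 below is arbitrary.)

#List terms appearing at least 10 times
findFreqTerms(dtm, lowfreq=10)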

#transpose of this matrix
tdm <- TermDocumentMatrix(docs)
tdm

##TCorpus <- tm_map(TCorpus, removeWords, badWords)
wordcloud(docs, scale=c(3,0.5), min.freq=5, max.words=100, random.order=TRUE,
          rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)
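
(If the corpus is in a bad state after the PlainTextDocument step, one workaround is to pass wordcloud() explicit words and frequencies instead of the corpus itself. A sketch, reusing the tdm built above:)

#Build the cloud from term frequencies rather than the corpus object
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
wordcloud(names(freq), freq, scale=c(3,0.5), min.freq=5, max.words=100,
          random.order=TRUE, rot.per=0.5, colors=brewer.pal(8, "Set1"))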
#Tokenizer functions
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
quadgram <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
fivegram <- function(x) NGramTokenizer(x, Weka_control(min=5, max=5))
sixgram <- function(x) NGramTokenizer(x, Weka_control(min=6, max=6))

#Word/phrase count function
freq_df <- function(tdm){
  # Helper function to tabulate frequency
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_df <- data.frame(word=names(freq), freq=freq)
  return(freq_df)
}

#Creating the n-grams
corpus.unigram <- TermDocumentMatrix(docs)
corpus.unigram <- removeSparseTerms(corpus.unigram, 0.99)
corpus.unigram.freq <- freq_df(corpus.unigram)

corpus.bigram <- TermDocumentMatrix(docs, control=list(tokenize=bigram))
corpus.bigram <- removeSparseTerms(corpus.bigram, 0.999)
corpus.bigram.freq <- freq_df(corpus.bigram)

corpus.trigram <- TermDocumentMatrix(docs, control=list(tokenize=trigram))
corpus.trigram <- removeSparseTerms(corpus.trigram, 0.99)
corpus.trigram.freq <- freq_df(corpus.trigram)

corpus.quadgram <- TermDocumentMatrix(docs, control=list(tokenize=quadgram))
corpus.quadgram <- removeSparseTerms(corpus.quadgram, 0.9999)
corpus.quadgram.freq <- freq_df(corpus.quadgram)

corpus.fivegram <- TermDocumentMatrix(docs, control=list(tokenize=fivegram))
corpus.fivegram <- removeSparseTerms(corpus.fivegram, 0.9999)
corpus.fivegram.freq <- freq_df(corpus.fivegram)

corpus.sixgram <- TermDocumentMatrix(docs, control=list(tokenize=sixgram))
corpus.sixgram <- removeSparseTerms(corpus.sixgram, 0.9999)
corpus.sixgram.freq <- freq_df(corpus.sixgram)

top_50 <- function(df, title, color) {
  ggplot(df[1:50,], aes(x = 1:50, y = freq)) +
    geom_bar(stat = "identity", fill = color, colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, 51)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq(1, 50, by = 1), labels = df$word[1:50]) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}


top_50(corpus.unigram.freq,"Top 50 words","green")
top_50(corpus.bigram.freq,"Top 2 word combos","yellow")
top_50(corpus.trigram.freq,"Top 3 word combos","orange")
top_50(corpus.quadgram.freq,"Top 4 word combos","red")
top_50(corpus.fivegram.freq,"Top 5 word combos","blue")
top_50(corpus.sixgram.freq,"Top 6 word combos","purple")

1 Answer:

Answer 0 (score: 0)

I found the problem. In the code above I was using a simple corpus (Corpus), which does not work with the control from RWeka (Weka_control). I switched to VCorpus and everything worked fine.
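
(For anyone who hits the same thing: as best I can tell, in recent tm releases Corpus(VectorSource(...)) returns a SimpleCorpus, and TermDocumentMatrix silently ignores a custom tokenize control for a SimpleCorpus, which is why every n-gram matrix collapsed to unigrams. A minimal sketch of the change; everything downstream stays the same:)

#Use VCorpus so the RWeka tokenizers in the control list are honored
docs <- VCorpus(VectorSource(df1))
corpus.bigram <- TermDocumentMatrix(docs, control=list(tokenize=bigram))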