我正在尝试使用tm包在R上开发基本文本分析。
输入文件：一个汇总了几家酒店评论的 CSV 文件。
我已经导入它并通过tm包提供的转换实现了一些数据清理任务。
然后,当我使用以下脚本创建文档术语矩阵时:
DocumentTermMatrix(tm_map(reviewc, PlainTextDocument))
我得到的矩阵里没有任何有意义的单词，只有一些乱码字符：
inspect(try[1:5, 200:500])
<<DocumentTermMatrix (documents: 5, terms: 301)>>
Non-/sparse entries: 0/1505
Sparsity : 100%
Maximal term length: 25
Weighting : term frequency (tf)
Terms
Docs “extensiveâ€\u009d “extraâ€\u009d “finest “freeâ€\u009d “fromâ€\u009d “funkyâ€\u009d “goodâ€\u009d “half
character(0) 0 0 0 0 0 0 0 0
character(0) 0 0 0 0 0 0 0 0
character(0) 0 0 0 0 0 0 0 0
character(0) 0 0 0 0 0 0 0 0
character(0) 0 0 0 0 0 0 0 0
任何人都知道我该怎么做才能避免这个错误?
提前感谢您的帮助
干杯!
答案 0 :(得分:1)
library(tm)
library(SnowballC)
library(ggplot2)
library(FactoMineR)
library(RColorBrewer)
library(ape)
library(wordcloud)
library(stringr)
# Load the hotel reviews: fields are semicolon-separated, lines starting
# with "#" are skipped. NOTE(review): path is user-specific — adjust.
beijing_review <- read.csv("~/Downloads/beijing_review.csv", sep=";", comment.char="#")
# Custom stopwords to drop later (venue-specific words plus stray emoji),
# used in addition to the language stopword lists below
cleanwords = c("germany","alemania","bravcger", "\U0001f604\U0001f60a\U0001f44d\U0001f44d") ## Remove words
# Lower-case `x`, returning NA instead of raising on inputs tolower()
# cannot handle (e.g. invalid multibyte strings from bad encodings).
# The original called tolower(x) twice — once inside tryCatch and once
# after the error check; a single protected call is equivalent.
tryTolower = function(x)
{
  # either the lowered string or the captured error condition
  result = tryCatch(tolower(x), error=function(e) e)
  if (inherits(result, "error"))
    return(NA)
  result
}
# Clean a character vector of review text: lower-case it, strip Twitter
# artefacts (RT markers, @mentions, links), punctuation, digits and
# excess whitespace, then replace every remaining non-alphanumeric
# character with a space. That final pass also removes curly quotes and
# other mis-encoded characters that [[:punct:]] does not match — the
# cause of the mojibake terms in the question's DocumentTermMatrix.
clean.text = function(x)
{
  # tolower, protected against encoding errors
  x = tryTolower(x)
  # remove "rt " retweet markers
  x = gsub("rt ", "", x)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove http links
  x = gsub("http\\w+", "", x)
  # collapse runs of two or more spaces/tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # replace any remaining non-alphanumeric character with a space;
  # base gsub is equivalent to stringr::str_replace_all here and
  # drops the stringr dependency from this function
  x = gsub("[^[:alnum:]]", " ", x)
  # explicit return — the original had this commented out, so the
  # function returned the last assignment's value invisibly
  return(x)
}
# Clean the review text column, then collapse all reviews into one long
# string — the whole corpus is treated as a single document downstream
texto_c = clean.text(beijing_review$text) # Get column text
texto_ac= paste(texto_c, collapse=" ")
# Keep only the whitespace-separated tokens that contain at least one
# ASCII letter (either case), re-joined with single spaces. Tokens made
# purely of digits or leftover symbols are dropped.
rmNonAlphabet <- function(str) {
  tokens <- unlist(strsplit(str, " "))
  has_letter <- grepl("[a-z]", tokens, ignore.case = TRUE)
  paste(tokens[has_letter], collapse = " ")
}
texto_ac = rmNonAlphabet(texto_ac)
# Single-document corpus from the collapsed, cleaned text
busca_corpus = Corpus(VectorSource(texto_ac))
# Build the term-document matrix. Fixes relative to the original call:
# - `tryTolower` is not a recognized control option for tm's term
#   matrices; the supported flag is `tolower` (lower-casing was already
#   done in clean.text, but the flag is harmless and explicit).
# - `stopwords` was supplied twice in the control list (once as a word
#   vector, once as TRUE) and `cleanwords` was concatenated twice;
#   a single de-duplicated stopword list replaces both.
tdm = TermDocumentMatrix(busca_corpus,
                         control = list(removePunctuation = TRUE,
                                        removeNumbers = TRUE,
                                        tolower = TRUE,
                                        stopwords = unique(c(cleanwords,
                                                             stopwords("english"),
                                                             stopwords("spanish"),
                                                             stopwords("portuguese")))))
# Dense matrix view of the term-document matrix (terms are rows)
m = as.matrix(tdm)
palavras_freqs = sort(rowSums(m), decreasing=TRUE) # word frequencies, most frequent first
dm= data.frame(word=names(palavras_freqs), freq=palavras_freqs)
# Document-term matrix of the same corpus (transposed orientation of tdm)
dtm = DocumentTermMatrix(busca_corpus)
dtm_matrix = as.matrix(dtm)
top_palavras = head(palavras_freqs, 30) # top 30 most frequent words (the original comment said "10 users who tweeted most", which does not match this code)
barplot(top_palavras, border=NA, las=1, main="30 Top Words", xlab="# of Rep", cex.main=1, horiz=TRUE, cex.names=0.65, axis.lty=1)
# Plot word cloud — at most 100 words, minimum frequency 50
wordcloud(dm$word, dm$freq, random.order=FALSE, min.freq=50,colors=brewer.pal(8, "Dark2"), max.words = 100)