在继续进行文本挖掘分析时,我遇到了一个错误:我正在尝试清理语料库。
# Load the custom stop-word list and keep its first column (V1) as a
# character vector. The two statements must be on separate lines (or
# separated by `;`); fused on one line they do not parse.
my.stopwords <- read.table("path to my stopword list")
stops <- as.character(my.stopwords$V1)
#' Build a cleaned tm corpus from a vector of raw documents.
#'
#' Cleaning steps, in order: platform-specific re-encoding, lower-casing,
#' digit removal, URL/newline removal and (optionally) stop-word removal.
#'
#' @param df Object handed to `VectorSource()`; each element becomes one
#'   document of the corpus.
#' @param stops Character vector of stop words to drop. The default (empty)
#'   skips stop-word removal.
#' @return The cleaned corpus.
generateCorpus <- function(df, stops = c()) {
  myCorpus <- Corpus(VectorSource(df))

  # tm_map() can hand each *document* object (not a character vector) to the
  # mapped function, so plain string functions are wrapped in
  # content_transformer(). Passing them bare is what triggers
  # "enc2utf8(x): argument is not a character vector".
  os_name <- Sys.info()[["sysname"]]
  if (identical(os_name, "Darwin")) {
    myCorpus <- tm_map(
      myCorpus,
      content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte"))
    )
  }
  if (identical(os_name, "Windows")) {
    myCorpus <- tm_map(
      myCorpus,
      content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
    )
  }

  # convert to lower case
  cat("Converting to lowercase... ")
  myCorpus <- tm_map(myCorpus, content_transformer(tolower))
  cat("done!\n")

  # remove numbers and URLs
  cat("Removing digits and URLs... ")
  myCorpus <- tm_map(myCorpus, content_transformer(removeNumbers))
  # NOTE(review): `.*` is greedy, so the `(http.*)$` alternative deletes
  # everything from the first "http" to the end of the string, and the first
  # alternative only matches after a literal double quote. Pattern kept as-is;
  # confirm whether a tighter one such as "http[^[:space:]]*" was intended.
  removeURL <- function(x) gsub('"(http.*) |(http.*)$|\n', "", x)
  myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
  cat("done!\n")

  # Apply the caller-supplied stop words (previously accepted but ignored).
  if (length(stops) > 0) {
    myCorpus <- tm_map(myCorpus, removeWords, stops)
  }

  myCorpus
}
# Build the corpus, passing the custom stop-word vector as `stops`.
# NOTE(review): `myCorpus` is not defined at top level in the code shown here
# (it is only assigned inside generateCorpus()); the first argument is
# presumably meant to be the raw text data -- confirm.
my.corpus <- generateCorpus(myCorpus, stops)
结果我收到了这条消息
enc2utf8(x)出错:参数不是字符向量
这是什么意思?
更新
> if (Sys.info()['sysname']=="Windows"){
+ myCorpus <- tm_map( myCorpus, function(x) iconv(enc2utf8(as.character(x)), sub = "byte")
+ )
+ myCorpus <- tm_map(myCorpus, content_transformer(function(x) iconv(enc2utf8(as.character(x)), sub = "byte")))
+ }
Show Traceback
Rerun with Debug
Error in UseMethod("content", x) :
no applicable method for 'content' applied to an object of class "character"
我认为主要问题出在 content 上:
Error in UseMethod("content", x) :
no applicable method for 'content' applied to an object of class "character"
Called from: content(x)
Browse[1]>
function (x)
UseMethod("content", x)
有人解决过这个问题吗?
答案 0 :(得分:0)
似乎 tm_map 迭代的不是字符串,而是文档对象。
# Load the `crude` example corpus (shipped with tm).
data(crude)
# Drop into the interactive debugger inside the mapped function so that the
# object tm_map() passes as `x` can be inspected.
tm_map(crude, function(x) browser())
然后,检查x,
x
#<<PlainTextDocument>>
#...
enc2utf8(x)
#error in enc2utf8(x) : argument is not a character vector
先对其应用 as.character 就可以了,也就是说:
# Re-encode the text of every document: pull the content out as a character
# vector, convert it with enc2utf8(), let iconv() substitute any
# non-convertible bytes (sub = "byte"), then store it back in the document.
myCorpus <- tm_map(myCorpus, function(doc) {
  text <- as.character(content(doc))
  content(doc) <- iconv(enc2utf8(text), sub = "byte")
  doc
})
可能会成功。
答案 1 :(得分:0)
我在这个网站找到了解决方案 http://www.listendata.com/2014/11/create-wordcloud-with-r.html
我修改了其中几行代码,之后就得到了想要的结果:
# Delete URL-like text and newlines from a string: a double quote followed by
# "http..." up to a space, or "http..." up to the end of the string, or "\n".
removeURL <- function(x) gsub('"(http.*) |(http.*)$|\n', "", x)
# Apply removeURL to every document of the existing corpus.
# NOTE(review): removeURL is passed bare here (no content_transformer()).
myCorpus <- tm_map(myCorpus, removeURL)
# Named list of term-weighting / preprocessing options.
# NOTE(review): in the code shown here this list is only passed to
# generateCorpus(), which does not use it; it presumably is meant as a
# `control` list for a term-document matrix -- confirm.
kb.tf <- list(weighting = weightTf, stopwords = stopwords,
removePunctuation = TRUE,
tolower = TRUE,
minWordLength = 4,
removeNumbers = TRUE, stripWhitespace = TRUE,
stemDocument= TRUE)
# Map PlainTextDocument over the corpus; presumably this restores the document
# class after mapping a plain function above -- confirm.
myCorpus <- tm_map(myCorpus, PlainTextDocument)
# Build a corpus from `df` via VectorSource(). The `kb.tf` argument is
# accepted but not used in the body. The corpus is the value of the final
# assignment, so it is returned invisibly.
generateCorpus <- function(df, kb.tf = c()) {
myCorpus <- Corpus(VectorSource(df))
}
# Wrap the already-processed `myCorpus` in a new corpus.
my.corpus <- generateCorpus(myCorpus, kb.tf)
我想这个回答应该会有所帮助,因为我看到了不少类似的帖子,这个问题很常见。