# init
libs <- c("tm", "plyr", "class", "RTextTools", "randomForest")
lapply(libs, require, character.only = TRUE)
# set options
options(stringsAsFactors = FALSE)
# set parameters
labels <- read.table('labels.txt')
path <- paste(getwd(), "/data", sep="")
# clean text
cleanCorpus <- function(corpus) {
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
corpus.tmp <- tm_map(corpus.tmp, stemDocument, language = "english")
corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
return(corpus.tmp)
}
# build TDM
generateTDM <- function(label, path) {
s.dir <- sprintf("%s/%s", path, label)
s.cor <- Corpus(DirSource(directory = s.dir), readerControl = list(language = "en"))
s.cor.cl <- cleanCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl)
s.tdm <- removeSparseTerms(s.tdm, 0.7)
return(list(name = label, tdm = s.tdm))
}
tdm <- lapply(labels, generateTDM, path = path)
# attach name
bindLabelToTDM <- function(tdm) {
s.mat <- t(data.matrix(tdm[["tdm"]]))
s.df <- as.data.frame(s.mat, stringsAsFactors = FALSE)
s.df <- cbind(s.df, rep(tdm[["name"]], nrow(s.df)), row.names = NULL)
colnames(s.df)[ncol(s.df)] <- "targetlabel"
return(s.df)
}
labelTDM <- lapply(tdm, bindLabelToTDM)
# stack
tdm.stack <- do.call(rbind.fill, labelTDM)
tdm.stack[is.na(tdm.stack)] <- 0
# hold-out
train.idx <- sample(nrow(tdm.stack), ceiling(nrow(tdm.stack) * 0.7))
test.idx <- (1:nrow(tdm.stack)) [- train.idx]
tdm.lab <- tdm.stack[, "targetlabel"]
tdm.stack.nl <- tdm.stack[, !colnames(tdm.stack) %in% "targetlabel"]
train <- tdm.stack[train.idx, ]
test <- tdm.stack[test.idx, ]
train$targetlabel <- as.factor(train$targetlabel)
label.rf <- randomForest(targetlabel ~ ., data = train, ntree = 5000, mtry = 15, importance = TRUE)
我正在尝试使用randomForest算法对文本文件进行多类分类。我得到的错误可能是因为最后一行或倒数第二行。
Error in eval(expr, envir, enclos) : object '∗' not found
tdm.stack包含名称为文档中找到的单词的列,其单元格值为其频率。最后一列包含类值。
我已经尝试了一切我无法弄清楚问题的方法。请帮忙。
答案 0 :(得分:0)
错误是由我的语料库中存在非ASCII字符引起的。 我将此行添加到我的cleanCorpus函数中以删除非ASCII字符
corpus.tmp <- tm_map(corpus.tmp, function(x) iconv(x, "latin1", "ASCII", sub=""))
这解决了这个问题。