我已经和NaiveBayes分类了好几个小时了。标记为TrainData.tsv的文件由3列和25000行组成。列是id(chr),sentiment(int)和review(chr)。
我试着预测测试数据的情绪,但无论我做什么,它都会返回50/50的猜测。产生的真值表总是如下:
prediction 0 1
0 199 201
1 0 0
我做错了什么?代码
library(e1071)
library(tm)
trainData <- read.table("data/labeledTrainData.tsv", header = TRUE, stringsAsFactors = FALSE)
trainData <- trainData[sample(nrow(trainData)),]
range.train <- c(1:1600)
range.test <- c(1601:2000)
df <- trainData[1:20000,]
df$sentiment <- as.factor(df$sentiment)
df.train <- df[range.train,]
df.test <- df[range.test,]
corpus <- Corpus(VectorSource(df$review))
corpus.clean <- cleanText(corpus)
dtm <- DocumentTermMatrix(corpus.clean)
trainNB <- as.matrix(dtm[range.train,])
testNB <- as.matrix(dtm[range.test,])
system.time( classifier <- naiveBayes(trainNB, df.train$sentiment) )
system.time( prediction <- predict(classifier, testNB[,-1]) )
table(prediction, df.test$sentiment)
convert_count <- function(x) {
y <- ifelse(x > 0, 1, 0) # 1 might have to be changed to x
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
}
cleanText <- function(text) {
text %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords('en')) %>%
tm_map(toSpace, "<br />") %>%
tm_map(toSpace, "/") %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(stemDocument)
}
toSpace <- content_transformer(function(x, pattern) {
return (
gsub(pattern, " ", x)
)
})