我正在处理孟加拉语文本,数据集包含大约9800条推文和Facebook状态(孟加拉语文字混合英语),每条都标注了正面或负面情绪。
我尝试使用朴素贝叶斯(Naïve Bayes)算法以及其他机器学习算法进行文本分类。
但在这个过程中我遇到了困难。
所以,如何在我的数据集(孟加拉语文本)上正确地运行朴素贝叶斯算法和其他机器学习算法(决策树、支持向量机)?
注意:我正在分享样本数据 https://1drv.ms/x/s!Al917DZ-85m3ghcLoFHX4rWUTFOS
library(tm)
library(e1071)
library(MLmetrics)
# Start from the labelled posts and give the two columns fixed names:
# the class label first, the post text second.
rawData <- bntextt
colnames(rawData) <- c("type", "text")

# Convert the text to UTF-8 and treat the label as a categorical variable.
rawData[["text"]] <- iconv(rawData[["text"]], to = "utf-8")
rawData[["type"]] <- factor(rawData[["type"]])

# Labels for rows 1-9800 (training) and rows 9801-9883 (held-out test).
sms_train_raw <- rawData[["type"]][1:9800]
Sms_test_raw <- rawData[["type"]][9801:9883]
# Build one corpus over every post so that the training and test text go
# through exactly the same cleaning steps.
sms_corpus <- Corpus(VectorSource(rawData$text))
corpus <- tm_map(sms_corpus, content_transformer(tolower))
# ucp = TRUE selects digits and punctuation by Unicode category rather than
# the ASCII classes, so Bengali digits and punctuation such as the danda are
# removed along with their ASCII counterparts.
corpus <- tm_map(corpus, removeNumbers, ucp = TRUE)
# The stop-word list is the English one (the default, made explicit here), so
# this step only filters the English tokens of the mixed-language posts.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation, ucp = TRUE)
corpus <- tm_map(corpus, stripWhitespace)

# Same row ranges as the label split: 1-9800 train, 9801-9883 test.
sms_corpus_train <- corpus[1:9800]
sms_corpus_test <- corpus[9801:9883]
# Document-term matrix over the whole cleaned corpus, split by the same row
# ranges used for the labels (1-9800 train, 9801-9883 test).
sms_dtm <- DocumentTermMatrix(corpus)
sms_dtm_train <- sms_dtm[1:9800, ]
sms_dtm_test <- sms_dtm[9801:9883, ]

# Terms occurring at least 5 times in the training documents define the
# feature vocabulary.
five_times_words_train <- findFreqTerms(sms_dtm_train, 5)
# Still computed in case later code inspects it, but it must not drive the
# test features: a vocabulary picked from the test split would give the test
# matrix different columns from the ones the classifier is trained on.
five_times_words_test <- findFreqTerms(sms_dtm_test, 5)

# Rebuild both matrices on the TRAINING vocabulary so that train and test
# share an identical set of term columns.
sms_dtm_train <- DocumentTermMatrix(
  sms_corpus_train,
  control = list(dictionary = five_times_words_train)
)
sms_dtm_test <- DocumentTermMatrix(
  sms_corpus_test,
  control = list(dictionary = five_times_words_train)
)
# Recode counts as a presence indicator.
#
# x: vector of term counts.
# Returns a factor with levels "No" (count not above zero) and "Yes"
# (count above zero); NA counts stay NA.
convert_count <- function(x) {
  factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Recode every term column of both matrices as a "No"/"Yes" presence
# indicator.
sms_train <- apply(sms_dtm_train, 2, convert_count)
sms_test <- apply(sms_dtm_test, 2, convert_count)

# Fit naive Bayes on the training indicators and labels, then classify the
# held-out rows.
sms_classifier <- naiveBayes(sms_train, sms_train_raw)
class(sms_classifier)
sms_test_pred <- predict(sms_classifier, newdata = sms_test, type = "class")

# Confusion matrix: rows are the true labels, columns the predictions.
table(Sms_test_raw, sms_test_pred)

# Score against the held-out labels in Sms_test_raw. Arguments are named
# because MLmetrics orders them differently per metric:
# Accuracy(y_pred, y_true) but F1_Score(y_true, y_pred).
Accuracy(y_pred = sms_test_pred, y_true = Sms_test_raw)
F1_Score(y_true = Sms_test_raw, y_pred = sms_test_pred)