我正在尝试使用R检查朴素贝叶斯分类的准确性。目前,我可以对数据集进行随机化,并将50%的数据用于训练,另外50%的数据用于测试。分类器必须根据评论内容预测标签为0或1。
我希望改用交叉验证来完成同样的评估,而不是手动将数据划分为训练集和测试集,以便获得10次验证的平均准确率。
R代码如下:
# Load required libraries
library(tm)
library(RTextTools)
library(e1071)
library(dplyr)
library(caret)
# Library for parallel processing
# One-time installation, if doMC is not installed yet:
# install.packages("doMC", repos="http://R-Forge.R-project.org")
# NOTE(review): doMC is presumably fork-based and may not be usable on
# Windows, while the data directory set in this script is a Windows path --
# confirm the backend actually registers on the target machine.
library(doMC)
# Register the doMC parallel backend with as many workers as detectCores()
# reports. detectCores() is not defined in this script; presumably it comes
# from the 'parallel' package attached together with doMC -- TODO confirm.
registerDoMC(cores=detectCores()) # Use all available cores
# Load Data
# NOTE(review): the machine-specific working directory is kept unchanged
# because other steps may rely on it; a project-relative path would be more
# portable.
setwd("C:/Users/malsa876/Desktop/RTest")
df <- read.csv("mytracks_NaiveBayes_Filter.csv", stringsAsFactors = FALSE)
glimpse(df)

# Randomizing Dataset
# Seed the RNG so the shuffle, and therefore the train/test split and the
# reported accuracy, can be reproduced. A single permutation is sufficient;
# shuffling a second time adds no extra randomness.
set.seed(123)
df <- df[sample(nrow(df)), ]
glimpse(df)

# Create the factor column 'class' from the 'Label' column.
df$class <- as.factor(df$Label)

# One corpus document per entry of df$Review, in the shuffled row order.
corpus <- Corpus(VectorSource(df$Review))
# Inspect the corpus
corpus
inspect(corpus[1:3])

dtm <- DocumentTermMatrix(corpus)
# Inspect the dtm
inspect(dtm[40:50, 10:15])

# Data Partitioning
# Split 50/50 based on the actual number of rows instead of hard-coding
# 1:500 / 501:1000. For a 1000-row dataset the split is exactly the same;
# other sizes no longer produce out-of-range indices or unused rows.
n.docs <- nrow(df)
stopifnot(n.docs >= 2)
n.train <- n.docs %/% 2
train.idx <- seq_len(n.train)
test.idx <- seq(n.train + 1, n.docs)

df.train <- df[train.idx, ]
df.test <- df[test.idx, ]
dtm.train <- dtm[train.idx, ]
dtm.test <- dtm[test.idx, ]
# NOTE(review): despite the "clean" in the names, no cleaning transformation
# is applied to the corpus in the visible code.
corpus.clean.train <- corpus[train.idx]
corpus.clean.test <- corpus[test.idx]
dim(dtm.train)

# Optional vocabulary pruning (disabled): keep only terms that occur at
# least twice in the training documents.
# fivefreq <- findFreqTerms(dtm.train, 2)
# length(fivefreq)
dtm.train.nb <- DocumentTermMatrix(corpus.clean.train)
dim(dtm.train.nb)

# Build the test matrix against the TRAINING vocabulary so both matrices
# describe the same features: terms seen only in the test documents are
# dropped, and training terms missing from the test documents are tabulated
# as zero counts instead of being absent from the matrix.
dtm.test.nb <- DocumentTermMatrix(
  corpus.clean.test,
  control = list(dictionary = Terms(dtm.train.nb))
)
# Report the test matrix dimensions (the training ones are printed above).
dim(dtm.test.nb)
# Map term counts to presence/absence labels.
#
# x: a vector of counts (e.g. one term column of a document-term matrix).
# Returns a factor with levels "No" and "Yes": "Yes" where the count is
# greater than zero, "No" otherwise. NA counts stay NA.
convert_count <- function(x) {
  is_present <- x > 0
  factor(is_present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Binarize every term column (MARGIN = 2) of the training and test
# document-term matrices with convert_count.
trainNB <- apply(dtm.train.nb, MARGIN = 2, FUN = convert_count)
testNB <- apply(dtm.test.nb, MARGIN = 2, FUN = convert_count)

# Fit the Naive Bayes model on the training features and labels
# (laplace = 1), timing the call.
system.time(
  classifier <- naiveBayes(trainNB, df.train$class, laplace = 1)
)

# Predict labels for the held-out documents, timing the call.
system.time(
  pred <- predict(classifier, newdata = testNB)
)

# Cross-tabulate predicted against actual labels.
table(Predictions = pred, Actual = df.test$class)

# Confusion matrix from caret, followed by its per-class statistics, its
# overall statistics, and the accuracy figure on its own.
conf.mat <- confusionMatrix(pred, df.test$class)
conf.mat
conf.mat$byClass
conf.mat$overall
conf.mat$overall["Accuracy"]