我们正在使用R朴素贝叶斯进行文本分类。结果与手工计算不同。也许R正在执行一些规范化,分配并且不能在多项式模式下工作(带有频率的文本字)。另外我无法理解R naive byes计算机先验条件概率[,1] [,2]。它与计算值不同。
`computeNavieBayes=function(trainingDataPath,testData,isTrainingMode) {
out <- tryCatch(
{
library(tm)
library(e1071)
testDataTokens <-unlist(strsplit(testData, "[,]"))
dataText<-read.csv(trainingDataPath,header= TRUE,row.names=NULL)
trainvector <- as.vector(dataText$Text)
trainsource <- VectorSource(trainvector)
traincorpus <- Corpus(trainsource)
#REMOVE STOPWORDS
traincorpus <- tm_map(traincorpus,stripWhitespace)
traincorpus <- tm_map(traincorpus,tolower)
traincorpus <- tm_map(traincorpus, removeWords,stopwords("english"))
traincorpus<- tm_map(traincorpus,removePunctuation)
traincorpus <- tm_map(traincorpus, PlainTextDocument)
# CREATE TERM DOCUMENT MATRIX
trainmatrix <- t(TermDocumentMatrix(traincorpus))
model <-
naiveBayes(as.matrix(trainmatrix),dataText$Category,
type="raw",laplace=1,useKernel=FALSE)
print(model)
col1 <- c()
index <- 1
resultsColl <- vector()
for (valueToken in testDataTokens)
{
col1[1] <- valueToken
dataTest <- data.frame("col1"=col1)
testvector <- as.vector(dataTest)
testsource <- VectorSource(testvector)
testcorpus <- Corpus(testsource)
testcorpus <- tm_map(testcorpus,stripWhitespace)
testcorpus <- tm_map(testcorpus,tolower)
testcorpus <- tm_map(testcorpus, removeWords,stopwords("english"))
testcorpus<- tm_map(testcorpus,removePunctuation)
testcorpus <- tm_map(testcorpus, PlainTextDocument)
testmatrix <- t(TermDocumentMatrix(testcorpus))
print(testmatrix)
print(valueToken)
results<-predict(model, as.matrix(testmatrix),type="raw",laplace=1)
print(class(results))
print(typeof(results))
print(results)
resultsColl[index] <- toString(results)
index <- index +1
}
return (resultsColl)
},
error=function(cond)
{
#error(RML,cond)
},
warning=function(cond)
{
return(cond)
},
finally={
}
)
return(out)
}
# function call
result<- computeNavieByes("c:/software/nb.csv","laundering terrorist terrorist","N")
print(result)
`
# training data
Text,Category
laundering laundering laundering,Money laundering
bankfraud bankfraud,Money laundering
terrorist terrorist terrorist terrorist,Terrorist Financing
weapon weapon,Terrorist Financing
bribe bribe bribe bribe bribe,Bribery and Corruption
corrupt corrupt corrupt,Bribery and Corruption
TestData
"laundering terrorist terrorist"
**R Naive Byes calculations**
class prior probabilities
A-priori probabilities:
dataText$Category
Bribery and Corruption Money laundering Terrorist Financing
0.3333333 0.3333333 0.3333333
Conditional probabilities:
bankfraud
dataText$Category [,1] [,2]
Bribery and Corruption 0 0.000000
Money laundering 1 1.414214
Terrorist Financing 0 0.000000
bribe
dataText$Category [,1] [,2]
Bribery and Corruption 2.5 3.535534
Money laundering 0.0 0.000000
Terrorist Financing 0.0 0.000000
corrupt
dataText$Category [,1] [,2]
Bribery and Corruption 1.5 2.12132
Money laundering 0.0 0.00000
Terrorist Financing 0.0 0.00000
laundering
dataText$Category [,1] [,2]
Bribery and Corruption 0.0 0.00000
Money laundering 1.5 2.12132
Terrorist Financing 0.0 0.00000
terrorist
dataText$Category [,1] [,2]
Bribery and Corruption 0 0.000000
Money laundering 0 0.000000
Terrorist Financing 2 2.828427
weapon
dataText$Category [,1] [,2]
Bribery and Corruption 0 0.000000
Money laundering 0 0.000000
Terrorist Financing 1 1.414214
Bribery and Corruption Money laundering Terrorist Financing
[1,] 0.003077316 0.5628753 0.4340474
R naive byes classifies test data as "Money Laundering"
------------------------------------------------------------------------
Hand Computed Values
|v| = unique number of words in vocabulary=laundering,
bankfraud,terrorist,weapon,bribe,corrupt = 6
laplace smoothing = 1
(1) class prior probabilities
Money laundering = 2/6 = 1/3 = 0.3333
Terrorist Financing = 2/6 = 1/3 = 0.3333
Bribery and Corruption = 2/6 =1/3 = 0.3333
(2) prior conditional probabilities
p(laundering|Money laundering) = (3+1) / (5 + 6) = 4/11 = 0.3636
p(bankfraud|Money laundering) = (2 +1) / (5 +6) = 3/11= 0.2727
p(terrorist|Money laundering) = (0 +1) / (5+6) = 1/11= 0.0909
p(weapon|Money laundering) = (0 +1) / (5+6) = 1/11= 0.0909
p(bribe|Money laundering) = (0 +1) / (5+6) = 1/11= 0.0909
p(corrupt|Money laundering) = (0 +1) / (5+6) = 1/11= 0.0909
p(laundering|Terrorist Financing) = (0 +1) / (6+6) = 1/12 = 0.0833
p(bankfraud|Terrorist Financing) = (0 +1) / (6+6) = 1/12 = 0.0833
p(terrorist|Terrorist Financing) = (4+1) / (6+6) = 5/12 = 0.4166
p(weapon|Terrorist Financing) = (2+1) / (6+6) = 3/12 = 0.25
p(bribe|Terrorist Financing) = (0 +1) / (6+6) = 1/12 = 0.0833
p(corrupt|Terrorist Financing) = (0 +1) / (6+6) = 1/12 = 0.0833
p(laundering|Bribery and Corruption) = (0+1) / (8+6) = 1/14 = 0.0714
p(bankfraud|Bribery and Corruption) = (0+1) / (8+6) = 1/14 = 0.0714
p(terrorist|Bribery and Corruption) = (0+1) / (8+6) = 1/14 =0.0714
p(weapon|Bribery and Corruption) = (0+1) / (8+6) = 1/14 =0.0714
p(bribe|Bribery and Corruption) = (5+1) / (8+6) = 6/14 = 0.4285
p(corrupt|Bribery and Corruption) = (3 +1) / (8+6) = 4/14 = 0.2857
(3)posterior class probabilities
Test data -> laundering terrorist terrorist
p(Money laundering|test data) = 0.3333 * 0.3636 * 0.0909 * 0.0909 =
0.0010013524
p(Terrorist Financing|test data) = 0.3333 * 0.0833 * 0.4166* 0.4166 =
0.0048185774
p(Bribery and Corruption|test data) = 0.3333 * 0.0714 * 0.0714 * 0.0714 =
0.0001213193
Nornalized values
p(Money laundering|test data) =0.0010013524 / 0.0059412491 = 0.168542
p(Terrorist Financing|test data) = 0.0048185774 / 0.0059412491 = 0.811037
p(Bribery and Corruption|test data) = 0.0001213193 / 0.0059412491 =
0.020419
Hand calculated naive byes classifies test data as "Terrorist Financing"
(which is correct) but R classifies as "Money laundering" (which is wrong)