R朴素贝叶斯文本分类问题

时间:2015-12-12 17:28:40

标签: r machine-learning naivebayes

我们正在使用R的朴素贝叶斯(e1071包的naiveBayes)进行文本分类,但结果与手工计算的不一致。也许R在内部做了某种归一化或分布假设,而不是按多项式模型(以词频作为文本特征)来计算。另外,我无法理解R朴素贝叶斯输出的条件概率表中的[,1]和[,2]两列是如何计算的,它们与手工计算的值不同。

`computeNavieBayes <- function(trainingDataPath, testData, isTrainingMode) {
  # Train an e1071 naive Bayes text classifier from a CSV file and score
  # comma-separated test texts against it.
  #
  # Args:
  #   trainingDataPath: path to a CSV file with a header row and the columns
  #     `Text` and `Category`.
  #   testData: character string; it is split on "," and every piece is
  #     scored as a separate document.
  #   isTrainingMode: not used by this function; kept so existing callers
  #     keep working.
  #
  # Returns:
  #   A character vector with one element per test document, each holding the
  #   comma-joined posterior class probabilities (in the order of the class
  #   levels). Returns NULL if an error occurs; the error is reported with
  #   message() instead of being dropped silently.
  #
  # NOTE: e1071::naiveBayes models every numeric predictor as Gaussian. In the
  # printed "Conditional probabilities" tables, column [,1] is the per-class
  # mean of the term count and [,2] its standard deviation. `laplace` only
  # smooths categorical predictors, so it does not affect term counts. The
  # output therefore differs from a hand-computed multinomial naive Bayes.

  # Shared preprocessing so training and test text are cleaned identically.
  cleanCorpus <- function(texts) {
    corpus <- Corpus(VectorSource(texts))
    corpus <- tm_map(corpus, stripWhitespace)
    # Base functions must be wrapped so tm keeps the documents intact.
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    tm_map(corpus, removePunctuation)
  }

  tryCatch(
    withCallingHandlers(
      {
        library(tm)
        library(e1071)

        dataText <- read.csv(
          trainingDataPath,
          header = TRUE,
          row.names = NULL,
          stringsAsFactors = FALSE
        )

        # Rows are documents, columns are terms.
        trainCorpus <- cleanCorpus(as.character(dataText$Text))
        trainMatrix <- as.matrix(DocumentTermMatrix(trainCorpus))
        vocabulary <- colnames(trainMatrix)

        # An explicit factor keeps the class levels available to predict()
        # even when read.csv returns character columns.
        categories <- factor(dataText$Category)

        model <- naiveBayes(trainMatrix, categories, laplace = 1)
        print(model)

        testDataTokens <- unlist(strsplit(testData, "[,]"))
        resultsColl <- character(length(testDataTokens))

        for (index in seq_along(testDataTokens)) {
          valueToken <- testDataTokens[[index]]
          testCorpus <- cleanCorpus(valueToken)

          # Restrict the test matrix to the training vocabulary so every
          # feature the model knows is present (as 0 when the term is
          # missing); predict() skips features it cannot find by name.
          testMatrix <- DocumentTermMatrix(
            testCorpus,
            control = list(dictionary = vocabulary)
          )
          print(testMatrix)
          print(valueToken)

          results <- predict(model, as.matrix(testMatrix), type = "raw")
          print(class(results))
          print(typeof(results))
          print(results)

          resultsColl[index] <- toString(results)
        }

        resultsColl
      },
      warning = function(cond) {
        # Report the warning but keep going; one warning from a text
        # transformation should not discard the whole classification.
        message("computeNavieBayes warning: ", conditionMessage(cond))
        invokeRestart("muffleWarning")
      }
    ),
    error = function(cond) {
      message("computeNavieBayes failed: ", conditionMessage(cond))
      NULL
    }
  )
}


# Example call: train on the CSV file and score one test document.
# The function is defined as computeNavieBayes; the earlier spelling
# "computeNavieByes" referred to a function that does not exist.
result <- computeNavieBayes(
  "c:/software/nb.csv",
  "laundering terrorist terrorist",
  "N"
)
print(result)

`

# training data
Text,Category
laundering laundering laundering,Money laundering
bankfraud bankfraud,Money laundering
terrorist terrorist terrorist terrorist,Terrorist Financing
weapon weapon,Terrorist Financing
bribe bribe bribe bribe bribe,Bribery and Corruption
corrupt corrupt corrupt,Bribery and Corruption

TestData
"laundering terrorist terrorist"

 **R Naive Bayes calculations**
 class prior probabilities 
 A-priori probabilities:
 dataText$Category
 Bribery and Corruption       Money laundering    Terrorist Financing
         0.3333333              0.3333333              0.3333333



 Conditional probabilities:
                    bankfraud
 dataText$Category        [,1]     [,2]
 Bribery and Corruption    0 0.000000
 Money laundering          1 1.414214
 Terrorist Financing       0 0.000000

                    bribe
 dataText$Category        [,1]     [,2]
 Bribery and Corruption  2.5 3.535534
 Money laundering        0.0 0.000000
 Terrorist Financing     0.0 0.000000

                    corrupt
 dataText$Category        [,1]    [,2]
 Bribery and Corruption  1.5 2.12132
 Money laundering        0.0 0.00000
 Terrorist Financing     0.0 0.00000

                    laundering
 dataText$Category        [,1]    [,2]
 Bribery and Corruption  0.0 0.00000
 Money laundering        1.5 2.12132
 Terrorist Financing     0.0 0.00000

                    terrorist
 dataText$Category        [,1]     [,2]
 Bribery and Corruption    0 0.000000
 Money laundering          0 0.000000
 Terrorist Financing       2 2.828427

                    weapon
 dataText$Category        [,1]     [,2]
 Bribery and Corruption    0 0.000000
 Money laundering          0 0.000000
 Terrorist Financing       1 1.414214



        Bribery and Corruption Money laundering Terrorist Financing
 [1,]            0.003077316        0.5628753           0.4340474

 R naive Bayes classifies the test data as "Money laundering"

 ------------------------------------------------------------------------

 Hand Computed Values

 |v| = unique number of words in vocabulary=laundering,
   bankfraud,terrorist,weapon,bribe,corrupt = 6
 laplace smoothing = 1

 (1) class prior probabilities   
 Money laundering = 2/6 = 1/3 = 0.3333
 Terrorist Financing = 2/6 = 1/3 = 0.3333
 Bribery and Corruption = 2/6 =1/3 = 0.3333


 (2) prior conditional probabilities
 p(laundering|Money laundering) = (3+1) / (5 + 6) = 4/11 =  0.3636
 p(bankfraud|Money laundering) =  (2 +1) / (5 +6) = 3/11=   0.2727
 p(terrorist|Money laundering) =    (0 +1) / (5+6) = 1/11=    0.0909
 p(weapon|Money laundering) =     (0 +1) / (5+6) = 1/11=     0.0909
 p(bribe|Money laundering) =         (0 +1) / (5+6) = 1/11=     0.0909
 p(corrupt|Money laundering) =     (0 +1) / (5+6) = 1/11=      0.0909

 p(laundering|Terrorist Financing) = (0 +1) / (6+6) = 1/12 =  0.0833
 p(bankfraud|Terrorist Financing) = (0 +1) / (6+6) = 1/12 =   0.0833
 p(terrorist|Terrorist Financing) =    (4+1) / (6+6) = 5/12 =    0.4166
 p(weapon|Terrorist Financing) =     (2+1) / (6+6) = 3/12 =     0.25
 p(bribe|Terrorist Financing) =         (0 +1) / (6+6) = 1/12 =  0.0833 
 p(corrupt|Terrorist Financing) =     (0 +1) / (6+6) = 1/12 =  0.0833

 p(laundering|Bribery and Corruption) = (0+1) / (8+6) = 1/14 =  0.0714
 p(bankfraud|Bribery and Corruption) = (0+1) / (8+6) = 1/14 = 0.0714
 p(terrorist|Bribery and Corruption) = (0+1) / (8+6) = 1/14 =0.0714
 p(weapon|Bribery and Corruption) = (0+1) / (8+6) = 1/14 =0.0714
 p(bribe|Bribery and Corruption) =   (5+1) /  (8+6) = 6/14 = 0.4285
 p(corrupt|Bribery and Corruption) = (3 +1) / (8+6) = 4/14 = 0.2857


 (3)posterior class probabilities
 Test data -> laundering terrorist terrorist

 p(Money laundering|test data) = 0.3333 * 0.3636 * 0.0909 * 0.0909  =
 0.0010013524

 p(Terrorist Financing|test data) = 0.3333 * 0.0833 * 0.4166* 0.4166 =
 0.0048185774

 p(Bribery and Corruption|test data) = 0.3333 * 0.0714 * 0.0714 * 0.0714 =
 0.0001213193

 Normalized values
 p(Money laundering|test data) =0.0010013524 / 0.0059412491 = 0.168542
 p(Terrorist Financing|test data) = 0.0048185774  / 0.0059412491 = 0.811037
 p(Bribery and Corruption|test data) =  0.0001213193  / 0.0059412491 =
 0.020419

 Hand-calculated naive Bayes classifies the test data as "Terrorist Financing"
 (which is correct), but R classifies it as "Money laundering" (which is wrong)

0 个答案:

没有答案