我从自己的电子邮件帐户下载了一些被归类为促销邮件的邮件文件。我正在使用朴素贝叶斯分类器对它们进行分类,所用代码如下:
# Score one message file against a trained term table (naive Bayes).
#
# Arguments:
#   path         Passed to get.msg() to load the message.
#   training.df  Data frame with a `term` column and an `occurrence` column;
#                `occurrence` is used as the per-term probability.
#   prior        Prior weight of the class (default 0.5).
#   c            Probability assigned to every message term that does not
#                appear in training.df$term (default 1e-6).
#   log.scale    If TRUE, return the natural log of the score instead of the
#                score itself (default FALSE keeps the original behaviour).
#
# Returns:
#   prior * prod(matched occurrences) * c ^ (number of unmatched terms),
#   or the log of that quantity when log.scale = TRUE.
#
# Why log.scale exists: the plain product underflows to exactly 0 in double
# precision very quickly -- with c = 1e-6, roughly fifty unmatched terms are
# enough -- so every document ends up with a score of 0. Log scores stay
# finite and preserve the ordering, so compare classes with log.scale = TRUE.
classify.trip <- function(path, training.df, prior = 0.5, c = 1e-6,
                          log.scale = FALSE) {
  # Use the support functions to get the message into a workable format.
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  # NOTE(review): if get.tdm() returns a single-document matrix, this call
  # probably removes nothing -- confirm it is needed.
  msg.tdm <- removeSparseTerms(msg.tdm, 0.8)
  msg.freq <- rowSums(as.matrix(msg.tdm))

  # Terms of the message that were also seen in training.
  msg.match <- intersect(names(msg.freq), training.df$term)

  # With no matches this is numeric(0): prod() is 1 and sum() is 0, so the
  # no-match case needs no separate branch.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  n.unmatched <- length(msg.freq) - length(msg.match)

  if (log.scale) {
    # Skip the term entirely when nothing is unmatched so that c = 0 does not
    # produce 0 * -Inf = NaN.
    unmatched.term <- if (n.unmatched > 0) n.unmatched * log(c) else 0
    return(log(prior) + sum(log(match.probs)) + unmatched.term)
  }

  prior * prod(match.probs) * c^n.unmatched
}
# Score every test document against the promotional-mail training table.
# vapply() (rather than sapply()) guarantees exactly one numeric score per
# document and fails loudly if classify.trip() ever returns anything else.
promo.test <- vapply(
  ptest.docs,
  function(p) {
    classify.trip(file.path(ptest.path, p), training.df = promo.df)
  },
  numeric(1)
)
promo.test
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
但它对所有文件给出的概率都是零。