Clojure垃圾邮件过滤器

时间:2018-12-16 10:18:28

标签: clojure

我正在尝试在Clojure中实现Bayseian分类器。我正在使用集体智慧书作为参考。这是我的代码:

<script src="http://sdks.shopifycdn.com/js-buy-sdk/v1/latest/index.umd.min.js"></script>

我无法弄清楚我的上一个函数分类出了什么问题。当我用参数“ quick money”调用此函数时,我应该将此文档归类为“坏”。但是我得到了:

    (ns spam-filter.model.classifier
    (:require [clojure.string :as st]))

    ;Counts of feature/category combinations
    (def fc (atom {}))
    @fc


    ;Counts of documents in each category
    ;How many times every classification has been used
    (def cc (atom {}))
    @cc

    ;extracts features from the text
    (defn getwords
      "Divides the text on any character that isn't a letter.
      Converted to lowercase"
      [doc]
      (let [words (st/split doc #" ")
            less-than-20 (filter #(< (count %) 20) words)
            final-words (filter #(> (count %) 2) less-than-20)]
        (reduce (fn [final-map word]
                  (assoc final-map (.toLowerCase word) 1))
                {}
                final-words)))


    ;increase the count of a feature/category pair
    (defn incf
      [f cat] 
    (if (not (contains? @fc f))
      (swap! fc #(assoc % f {})))
    (if (not (get-in @fc [f cat]))
      (swap! fc #(assoc-in % [f cat] 0)))
    (swap! fc #(update-in % [f cat] inc)))



    ;increase the count of a category
    (defn incc
     [cat]
    (if (not (contains? @cc cat))
    (swap! cc #(assoc % cat 0)))
    (swap! cc #(update % cat inc)))


    ;The number of times a feature has appeared in a category
    (defn fcount
      [f cat]
    (let [num (get-in @fc [f cat])]
      (if (not (nil? num))
        num
        0.0)))



    ; The number of items in a category
    (defn catcount
      [cat]
    (let [n-of-items (get @cc cat)]
      (if (not (nil? n-of-items))
        n-of-items
        0)))




    ; The total numbers of items
    (defn totalcount
      []
    (reduce + (vals @cc)))



    ; The list of all categories
    (defn categories
    []
    (keys @cc))



    (defn train
      [t cat]
    (incc cat)
    (let [ws (keys (getwords t))]
      (for [w ws] (incf w cat))))



    (defn train1
      [t cat]
      (incc cat)
      (let [features (keys (getwords t))]
        (map incf features (repeat (count features) cat))))



    (defn sampletrain
    []
    [(train "Nobody owns the water." "good")
      (train "the quick rabbit jumps fences" "good")
      (train "buy pharmaceuticals now" "bad")
      (train "make quick money at the online casino" "bad")
      (train "the quick brown fox jumps" "good")])

    @fc
    @cc

    (sampletrain)


    ; probability that a word is in particular category
    ; Pr(word | classification)
    (defn fprob
      [f cat]
    (if (= (catcount cat) 0)
      0
    (float (/ (fcount f cat) (catcount cat)))))

    (fprob "quick" "good")



    ; probability that a word is in particular category
    ; assumed probability 0.5
    (defn weightedprob
      [f cat fprob]
    (let [weight 1
          ap 0.5
          basicprob (fprob f cat)
          totals (reduce + (vals (get @fc f)))
          bp (/ (+ (* weight ap) (* totals basicprob)) (+ weight totals))]
    bp))



    ; Extracts features and multiplies all
    ; their probabilities together to get
    ; an overall probability Pr(Document | Category)
    (defn docprob
      [item cat]
      (let [features (keys (getwords item))]

      (loop [features features
             p 1]
        (if (empty? features)
          p
          (recur
           (rest features)
           (* p (weightedprob (first features) cat fprob)))))))


    ;returns product of Pr(Document | Category) and Pr(Category)
    (defn prob
      [item cat]
      (let [catprob (/ (catcount cat) (totalcount))
            docprob (docprob item cat)]
        (* docprob catprob)))

    (prob "quick rabbit" "good")
    (prob "quick rabbit" "bad")


    (def thresholds (atom {}))


    (defn setthreshold
      [cat t]
    (swap! thresholds #(assoc % cat t)))


    (defn getthreshold
      [cat]
    (if (contains? @thresholds cat)
      (get @thresholds cat)
    1.0))

    (getthreshold "bad")


    (defn classify
      [item]
    (let [probs (atom {})
          max (atom 0.0)
          best (atom nil)]
    (map (fn [cat] ((swap! probs #(assoc % cat (prob item cat)))
                    (when (> (get @probs cat) @max)
                        (swap! max #(let [% (get @probs cat)] %))
                        (swap! best #(let [% cat] %))))) (categories))
    (map (fn [cat] (if (> (* (get @probs cat) (getthreshold @best)) (get @probs @best))
                      nil
                      @best))) (filter #(not= % @best) (categories))))



    (classify "quick money")

1 个答案:

答案 0 :(得分:2)

map是惰性的,因此,如果您在调用它时没有意识到生成的惰性序列,则不会起作用。您正在几个地方这样做。如果您必须针对集合的每个元素调用函数,则仅出于副作用,请考虑改用run!

更一般而言,您的算法实现取决于变异各种全局原子。这在Clojure中是非常普遍的,这使得它难以阅读且难以推理。尝试传入并返回预期状态。这样一来,您就不会遇到map问题。

您的代码中还存在很多与以下形式有关的竞争条件

  (defn incc
     [cat]
    (if (not (contains? @cc cat))
    (swap! cc #(assoc % cat 0))) ; no guarantee that @cc wasn't updated 
                                 ; between here and the contains? line above
    (swap! cc #(update % cat inc)))

使用fnil

可以简化此代码并使其无错误。
(defn incc
     [cat]
     (swap! cc update cat (fnil inc 0)))