Haskell版本的Peter Norvig拼写纠正器慢得令人难以置信

时间:2015-09-14 17:35:30

标签: haskell dictionary text

作为练习,我在Haskell写了Peter Norvig的spelling corrector algorithm

module Spl (nwords, correct)
    where

import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))

import qualified Data.Set as Set
import qualified Data.Map.Strict as Map

-- | Word-frequency model: maps each lowercase word to its corpus count.
type NWords = Map.Map String Int


-- | The lowercase letters used when generating candidate edits.
alphabet :: String
alphabet = ['a' .. 'z']

-- | Build the word-frequency model from raw corpus text.
nwords :: String -> Map.Map String Int
nwords corpus = train (words' corpus)

-- | Remove duplicates while keeping the first occurrence of each
--   element; a Set tracks what has already been emitted.
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = go Set.empty
    where go _ [] = []
          go seen (y:ys)
            | Set.member y seen = go seen ys
            | otherwise         = y : go (Set.insert y seen) ys

-- | Lowercase the input and extract every maximal run of letters a-z.
words' :: String -> [String]
words' text = getAllTextMatches (map toLower text =~ "[a-z]+")

-- | Count every word with a strict left fold over the corpus tokens.
train :: [String] -> NWords
train = foldl' bump Map.empty
    where bump acc w = Map.insertWith (+) w 1 acc

-- | All strings at edit distance 1 from @word@: deletions, adjacent
--   transpositions, single-letter replacements and insertions.
--   Duplicates are removed, preserving first-occurrence order.
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
    where -- Pattern matching in the generators replaces the partial
          -- head/tail/(!!) calls of the original; splits whose suffix
          -- is too short are skipped automatically, so the nn/length
          -- guards are no longer needed.
          dels   = [a ++ rest     | (a, _:rest)   <- splits]
          trans  = [a ++ y:x:rest | (a, x:y:rest) <- splits]
          repl   = [a ++ c:rest   | (a, _:rest)   <- splits, c <- alphabet]
          ins    = [a ++ c:b      | (a, b)        <- splits, c <- alphabet]
          splits = [splitAt n word | n <- [0 .. length word]]

-- | Dictionary words reachable by exactly two edits from @word@.
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems
    [ e2
    | e1 <- edits word
    , e2 <- edits e1
    , e2 `Map.member` nw
    ]

-- | Keep only candidates present in the model, dropping duplicates.
known :: NWords -> [String] -> [String]
known nw ws = uniqueElems (filter (\w -> Map.member w nw) ws)

-- | Pick the best correction for @word@: prefer the word itself if it
--   is known, then known distance-1 edits, then known distance-2
--   edits, finally the word unchanged.  Among the chosen candidate
--   group, the most frequent word in the model wins.
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
    where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
          -- 'head' is safe here: the final alternative [word] is
          -- always non-empty, so the filter never yields [].
          result     = head $ filter (not . null) start
          start      = [ known nw [word], known nw $ edits word
                       , knownEdits nw word , [word]]

用法
我就是这样用的:

ghci> t <- readFile "big.txt"
ghci> let nw = nwords t
ghci> correct nw "speling"
"spelling"

Peter Norvig网站上提供了big.txt文件(直接链接,6.2MB): http://norvig.com/big.txt

问题

  • 构建单词map耗时极长:train函数比words'慢得多,因此它是瓶颈。
  • 内存使用情况很疯狂。玩了一段时间后,我得到了近1 GB。

那么,我在哪里弄错了?我某处有内存泄漏吗?

2 个答案:

答案 0 :(得分:3)

我的主要建议是:

  • 使用高效的字符串类型(即Text / ByteString或其懒惰变体)
  • 使用更好的哈希映射实现 - 类似于Data.HashMap.Strict
  • 编写自定义单词解析器而不是使用正则表达式

以下代码可以在大约2秒内将所有big.txt加载到Data.HashMap.Strict。内存使用量约为25 MB(在64位系统上):

import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.List

-- | ASCII-only letter test, covering both lower and upper case.
isAlpha ch = lower || upper
  where lower = 'a' <= ch && ch <= 'z'
        upper = 'A' <= ch && ch <= 'Z'

-- | Split text into lowercase alphabetic words, skipping every
--   non-alphabetic character between them.
wrds :: T.Text -> [ T.Text ]
wrds txt
  | T.null word = []
  | otherwise   = T.toLower word : wrds rest
  where
    trimmed      = T.dropWhile (not . isAlpha) txt
    (word, rest) = T.span isAlpha trimmed

-- | Load "big.txt" and build a strict word-frequency HashMap; returns
--   a membership test and a frequency lookup closed over the map.
readDict = do
  allwords <- fmap wrds $ T.readFile "big.txt"
  -- BUG FIX: the original folded over `all words` (a typo that does
  -- not even compile); the intended argument is the `allwords` list
  -- bound above.  insertWith also replaces the two-step
  -- lookupDefault/insert counting.
  let h = foldl' add H.empty allwords
      add acc w = H.insertWith (+) w (1 :: Int) acc
      member = \k -> H.member k h
      frequency = \k -> H.lookupDefault 0 k h
  return (member, frequency)

使用惰性文本可能更有效 - 需要调查的内容。

这是我实施的其余部分 - 几乎遵循Norvig,但我做了一些你可能感兴趣的其他选择:

{-# LANGUAGE OverloadedStrings #-}

module SpellText
where

import qualified Data.Text as T
import Data.Text (Text)
import Data.Monoid
import Data.List.Ordered (nubSort)
import Data.Ord
import Data.List
import Control.Monad

-- | Dictionary as a pair of functions: membership test and frequency lookup.
type Dict = ( Text -> Bool, Text -> Int )

-- | Every lowercase letter as a one-character Text value.
singles :: [ Text ]
singles = [ T.singleton c | c <- ['a' .. 'z'] ]

-- | Candidates at edit distance 1 from @w@: deletes, transposes,
--   replaces and inserts.  nubSort de-duplicates the middle groups.
edits :: Text -> [ Text ]
edits w = deletes <> nubSort (transposes <> replaces) <> inserts
  where
    splits     = zip (T.inits w) (T.tails w)
    deletes    = [ a <> (T.drop 1 b) | (a,b) <- splits, T.length b > 0 ]
    transposes = [ a <> c <> (T.drop 2 b) | (a,b) <- splits, T.length b > 1,
                   let c = T.pack [ T.index b 1, T.index b 0 ] ]
    -- BUG FIX: a replacement applies whenever the suffix is non-empty
    -- (> 0).  The original guard `T.length b > 1` made it impossible
    -- to ever replace the final character of a word, diverging from
    -- Norvig's algorithm.
    replaces   = [ a <> c <> (T.drop 1 b) | (a,b) <- splits, T.length b > 0,
                    c <- singles ]
    inserts    = [ a <> c <> b | (a,b) <- splits, c <- singles ]

-- | Return the first list unless it is empty, otherwise the second.
orElse :: [a] -> [a] -> [a]
orElse as bs
  | null as   = bs
  | otherwise = as

-- | Correct a word.  'isMember' and 'frequency' come from the 'Dict'
--   pair: a membership test and a frequency lookup.  Preference
--   order: the word itself if known, then known distance-1 edits,
--   then known distance-2 edits, else the word unchanged; the most
--   frequent candidate in the chosen group wins.
correct :: Dict -> Text -> Text 
correct (isMember, frequency) w0 =
  maximumBy (comparing frequency) best
  where
    distance1 = edits w0
    distance2 = concatMap edits distance1

    keepKnown = filter isMember

    best = keepKnown [w0]
             `orElse` keepKnown distance1
             `orElse` keepKnown distance2
             `orElse` [w0]

用法是这样的:

{-# LANGUAGE OverloadedStrings #-}
import ... -- import the above code

-- Example driver: build the dictionary lookup functions once, then
-- correct a misspelled word with them.
main = do
  dictfns <- readDict
  print $ correct dictfns "howwa"

我测量的校正时间与Python版本相当 - 可能快10%。

答案 1 :(得分:1)

恕我直言,它看起来并不那么慢。nwords消耗的时间最多,另一个函数correct实际上非常快。在我的机器上,nwords需要大约6.5秒,correct需要不到0.1秒,这达到了每秒纠正10个单词的处理速度,而这正是Peter Norvig设定的目标。

通过对train函数的输入和输出使用force,我将性能提高了20%。

我用ghc -o Spl -O2 Spl.hs编译了程序。

-- | Benchmark variant of the question's program: the same spelling
--   logic plus a timing harness and 'force' applied in 'train'.
--   NOTE(review): duplicated from the listing above on purpose; kept
--   byte-identical here and only annotated.
module Main (nwords, correct, main)
    where

import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))

import Control.DeepSeq (deepseq, force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX

import qualified Data.Set as Set
import qualified Data.Map.Strict as Map

-- Word-frequency model: word -> occurrence count.
type NWords = Map.Map String Int


alphabet :: String
alphabet = enumFromTo 'a' 'z'

-- Build the frequency model from raw corpus text.
nwords :: String -> Map.Map String Int
nwords = train . words'

-- Order-preserving de-duplication backed by a Set.
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
    where uniq' _ [] = []
          uniq' seen (x:xs)
            | x `Set.member` seen = uniq' seen xs
            | otherwise           = x:uniq' (x `Set.insert` seen) xs

-- Lowercase the text and extract runs of [a-z] via regex.
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower

-- have a 20% performance improvement by using 'force' on input and output
train :: [String] -> NWords
train = force . foldl' populate Map.empty . force
    where populate m feature = Map.insertWith (+) feature 1 m

-- All candidates at edit distance 1.  NOTE(review): head/tail/(!!)
-- are partial, but each use is guarded by the nn/length checks.
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
    where dels   = [a ++ tail b | (a,b) <- splits, nn b]
          trans  = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
                   , length b > 1]
          repl   = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
          ins    = [a ++ c:b | (a,b) <- splits, c <- alphabet]
          splits = [splitAt n word | n <- [0..length word]]
          nn     = not . null

-- Known words reachable by exactly two edits.
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
                                 , Map.member e2 nw]

-- Filter a candidate list down to words present in the model.
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)

-- Best correction: known word > known 1-edit > known 2-edit > itself.
-- 'head' is safe: the final alternative [word] is non-empty.
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
    where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
          result     = head $ filter (not . null) start
          start      = [ known nw [word], known nw $ edits word
                       , knownEdits nw word , [word]]

-- Timing harness: read the corpus, force the model so its cost is
-- measured here (not lazily later), then correct sample words.
main = do
    time0 <- getPOSIXTime
    t <- readFile "big.txt"
    time1 <- getPOSIXTime
    putStrLn $ ":: readFile: "++(show $ time1-time0)

    let nw = nwords t
    evaluate $ force nw
    time2 <- getPOSIXTime
    putStrLn $ ":: nwords: " ++ (show $ time2-time1)

    putStrLn $ correct nw "speling"
    putStrLn $ correct nw "miracl"
    putStrLn $ correct nw "helllo"
    putStrLn $ correct nw "rabit"
    putStrLn $ correct nw "kitteen"
    putStrLn $ correct nw "breaks"
    putStrLn $ correct nw "sometheeng"
    putStrLn $ correct nw "clessical"
    putStrLn $ correct nw "theater"
    putStrLn $ correct nw "dishis"
    time3 <- getPOSIXTime
    putStrLn $ ":: correcting: " ++ (show $ time3-time2)

    -- NOTE(review): a, b, c and total are bound but never used.
    let a = time1-time0
    let b = time2-time1
    let c = time3-time2
    let total = time3 - time0
    putStrLn $ ":: total:  "++(show $ time3-time0)

这是我的输出:

:: readFile: 0.000202s
:: nwords: 6.063617s
spelling
miracle
hello
habit
kitten
breaks
something
classical
theater
dishes
:: correcting: 0.749441s
:: total:  6.81326s

此外,如果您不使用正则表达式,nwords函数会快两倍:

-- | Regex-free tokenizer: lowercase runs of alphabetic characters.
--   NOTE(review): the local 'words' shadows Prelude.words, and the
--   bindings c and e are never used.
words' :: String -> [String]
words' str = map (map toLower) $ words str
    where words str = if (null a)
                        then (if null b then [] else words f)
                        else a:(words d)
                            where (a,b) = span  isAlpha str
                                  (c,d) = break isAlpha b
                                  (e,f) = break isAlpha str

有趣的是,如果你试图纠正未知单词,纠正它们需要更长的时间。

(我是一名Haskell初学者,我试图通过回答stackoverflow问题以及编写一些玩具程序来学习这门语言。)

关于内存消耗:

在我的系统中,此版本最多只需要21MB。如果您使用正则表达式,或者在train中使用force,似乎就会出现空间泄漏;如果两者都省略,则表现良好。我认为这与自己编写的words'函数比基于正则表达式的words'函数更加惰性这一事实有关。

-- | Final benchmark variant: regex-free tokenizer and no 'force' in
--   'train' (the answer reports ~21MB peak memory with this version).
--   NOTE(review): duplicated from the listings above on purpose; kept
--   byte-identical here and only annotated.
module Main (nwords, correct, main)
    where

import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')

import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import Data.Char (isAlpha)

import qualified Data.Set as Set
import qualified Data.Map.Strict as Map

-- Word-frequency model: word -> occurrence count.
type NWords = Map.Map String Int


alphabet :: String
alphabet = enumFromTo 'a' 'z'

-- Build the frequency model from raw corpus text.
nwords :: String -> Map.Map String Int
nwords = train . words'

-- Order-preserving de-duplication backed by a Set.
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
    where uniq' _ [] = []
          uniq' seen (x:xs)
            | x `Set.member` seen = uniq' seen xs
            | otherwise           = x:uniq' (x `Set.insert` seen) xs

-- Regex-free tokenizer: lowercase runs of alphabetic characters.
-- NOTE(review): the local 'words' shadows Prelude.words, and the
-- bindings c and e are never used.
words' :: String -> [String]
words' str = map (map toLower) $ words str
    where words str = if (null a)
                        then (if null b then [] else words f)
                        else a:(words d)
                            where (a,b) = span isAlpha str
                                  (c,d) = break isAlpha b
                                  (e,f) = break isAlpha str

-- Count every word with a strict left fold (no 'force' here; see the
-- surrounding discussion on space leaks).
train :: [String] -> NWords
train = foldl' populate Map.empty
    where populate m feature = Map.insertWith (+) feature 1 m

-- All candidates at edit distance 1.  NOTE(review): head/tail/(!!)
-- are partial, but each use is guarded by the nn/length checks.
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
    where dels   = [a ++ tail b | (a,b) <- splits, nn b]
          trans  = [ a ++ (b!!1):head b:tail (tail b) | (a,b) <- splits
                   , length b > 1]
          repl   = [a ++ c:tail b | (a,b) <- splits, c <- alphabet, nn b]
          ins    = [a ++ c:b | (a,b) <- splits, c <- alphabet]
          splits = [splitAt n word | n <- [0..length word]]
          nn     = not . null

-- Known words reachable by exactly two edits.
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
                                 , Map.member e2 nw]

-- Filter a candidate list down to words present in the model.
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)

-- Best correction: known word > known 1-edit > known 2-edit > itself.
-- 'head' is safe: the final alternative [word] is non-empty.
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
    where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
          result     = head $ filter (not . null) start
          start      = [ known nw [word], known nw $ edits word
                       , knownEdits nw word , [word]]

-- Timing harness: read the corpus, force the model so its cost is
-- measured here (not lazily later), then correct sample words.
main = do
    time0 <- getPOSIXTime
    t <- readFile "big.txt"
    time1 <- getPOSIXTime
    putStrLn $ ":: readFile: "++(show $ time1-time0)

    let nw = nwords t
    evaluate $ force nw
    time2 <- getPOSIXTime
    putStrLn $ ":: nwords: " ++ (show $ time2-time1)

    putStrLn $ correct nw "speling"
    putStrLn $ correct nw "miracl"
    putStrLn $ correct nw "helllo"
    putStrLn $ correct nw "rabit"
    putStrLn $ correct nw "kitteen"
    putStrLn $ correct nw "breaks"
    putStrLn $ correct nw "sometheeng"
    putStrLn $ correct nw "clessical"
    putStrLn $ correct nw "theater"
    putStrLn $ correct nw "dishis"
    time3 <- getPOSIXTime
    putStrLn $ ":: correcting: " ++ (show $ time3-time2)

    -- NOTE(review): a, b, c and total are bound but never used.
    let a = time1-time0
    let b = time2-time1
    let c = time3-time2
    let total = time3 - time0
    putStrLn $ ":: total:  "++(show $ time3-time0)