作为练习,我用Haskell实现了Peter Norvig的拼写纠正算法(spelling corrector):
module Spl (nwords, correct)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
-- | Word-frequency table: maps each word to its occurrence count.
type NWords = Map.Map String Int
-- | The candidate letters used when generating one-character edits.
alphabet :: String
alphabet = ['a' .. 'z']
-- | Build the word-frequency model from raw training text.
nwords :: String -> Map.Map String Int
nwords text = train (words' text)
-- | Remove duplicates while keeping the first occurrence of each
-- element, carrying a 'Set' of already-seen values (O(n log n)).
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = go Set.empty
  where
    go _ [] = []
    go seen (y:ys)
      | Set.member y seen = go seen ys
      | otherwise         = y : go (Set.insert y seen) ys
-- | Lowercase the input and extract every maximal run of the letters
-- a-z, using the TDFA regex engine.
words' :: String -> [String]
words' s = getAllTextMatches (map toLower s =~ "[a-z]+")
-- | Count occurrences: fold the token list into a strict frequency map.
train :: [String] -> NWords
train ws = foldl' (\acc w -> Map.insertWith (+) w 1 acc) Map.empty ws
-- | Every string at edit distance one from @word@ — deletions, adjacent
-- transpositions, single-letter replacements and insertions — with
-- duplicates removed in first-occurrence order.
edits :: String -> [String]
edits word = uniqueElems (deletions ++ transpositions ++ replacements ++ insertions)
  where
    pieces         = [splitAt i word | i <- [0 .. length word]]
    deletions      = [pre ++ drop 1 suf | (pre, suf) <- pieces, not (null suf)]
    transpositions = [pre ++ suf !! 1 : head suf : drop 2 suf
                     | (pre, suf) <- pieces, length suf > 1]
    replacements   = [pre ++ ch : drop 1 suf
                     | (pre, suf) <- pieces, ch <- alphabet, not (null suf)]
    insertions     = [pre ++ ch : suf | (pre, suf) <- pieces, ch <- alphabet]
-- | Dictionary words reachable in exactly two edit steps from @word@.
knownEdits :: NWords -> String -> [String]
knownEdits nw word =
    uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1, e2 `Map.member` nw ]
-- | Keep only the words present in the frequency map, deduplicated.
known :: NWords -> [String] -> [String]
known nw ws = uniqueElems [ w | w <- ws, Map.member w nw ]
-- | Pick the most frequent candidate: the word itself if known, else
-- known words one edit away, else two edits away, else the input word.
-- The final tier @[word]@ guarantees 'head' never sees an empty list.
correct :: NWords -> String -> String
correct nw word = fst (maximumBy (comparing snd) scored)
  where
    scored     = [ (w, Map.findWithDefault 0 w nw) | w <- candidates ]
    candidates = head (filter (not . null) tiers)
    tiers      = [ known nw [word]
                 , known nw (edits word)
                 , knownEdits nw word
                 , [word] ]
用法
我就是这样用的:
ghci> t <- readFile "big.txt"
ghci> let nw = nwords t
ghci> correct nw "speling"
"spelling"
Peter Norvig网站上提供了big.txt
文件(直接链接,6.2MB):
http://norvig.com/big.txt
问题
train
函数比words'
慢得多,因此它是瓶颈。那么,我在哪里弄错了?我某处有内存泄漏吗?
答案 0 :(得分:3)
我的主要建议是:
Data.HashMap.Strict
以下代码可以在大约2秒内将所有big.txt
加载到Data.HashMap.Strict
。内存使用量约为25 MB(在64位系统上):
import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.List
-- | ASCII-only letter test (a-z, A-Z); deliberately narrower than
-- Data.Char.isAlpha, which also accepts non-ASCII letters.
isAlpha ch = inRange 'a' 'z' || inRange 'A' 'Z'
  where inRange lo hi = lo <= ch && ch <= hi
-- | Split a Text into lowercased words: skip leading separators, take
-- the next alphabetic run, recurse on the remainder.
wrds :: T.Text -> [ T.Text ]
wrds bs =
    case T.span isAlpha (T.dropWhile (not . isAlpha) bs) of
      (w, rest)
        | T.null w  -> []
        | otherwise -> T.toLower w : wrds rest
-- | Read "big.txt" and return a pair of lookup functions: a membership
-- test and a word-frequency lookup, both over a strict HashMap.
readDict = do
    allwords <- fmap wrds $ T.readFile "big.txt"
    -- BUG FIX: the original folded over "all words" — two unrelated
    -- Prelude functions — instead of the token list 'allwords'.
    let h = foldl' add H.empty allwords
        add acc w = let c = H.lookupDefault (0 :: Int) w acc
                    in H.insert w (c + 1) acc
        member k = H.member k h
        frequency k = H.lookupDefault 0 k h
    return (member, frequency)
使用惰性文本可能更有效 - 需要调查的内容。
这是我实施的其余部分 - 几乎遵循Norvig,但我做了一些你可能感兴趣的其他选择:
{-# LANGUAGE OverloadedStrings #-}
module SpellText
where
import qualified Data.Text as T
import Data.Text (Text)
import Data.Monoid
import Data.List.Ordered (nubSort)
import Data.Ord
import Data.List
import Control.Monad
-- | A dictionary as a pair of lookup functions: membership test and
-- word-frequency lookup.
type Dict = ( Text -> Bool, Text -> Int )
-- | One-character Texts, one per lowercase ASCII letter.
singles :: [ Text ]
singles = [ T.singleton ch | ch <- ['a' .. 'z'] ]
-- | All edits at distance one.  Only the middle group (transposes and
-- replaces) is deduplicated and sorted via 'nubSort', mirroring the
-- original's choice.
edits :: Text -> [ Text ]
edits w = deletes <> nubSort (transposes <> replaces) <> inserts
  where
    splits = zip (T.inits w) (T.tails w)
    deletes = [ a <> T.drop 1 b | (a,b) <- splits, T.length b > 0 ]
    transposes = [ a <> c <> T.drop 2 b | (a,b) <- splits, T.length b > 1,
                   let c = T.pack [ T.index b 1, T.index b 0 ] ]
    -- BUG FIX: the guard here was 'T.length b > 1', which never proposed
    -- a replacement of the *last* character (e.g. "cat" -> "cap" was
    -- missed).  Norvig's algorithm replaces at every position, so the
    -- suffix need only be non-empty.
    replaces = [ a <> c <> T.drop 1 b | (a,b) <- splits, T.length b > 0,
                 c <- singles ]
    inserts = [ a <> c <> b | (a,b) <- splits, c <- singles ]
-- | Return the first list unless it is empty, otherwise the second.
orElse :: [a] -> [a] -> [a]
orElse as bs = if null as then bs else as
-- | Correct a word. 'isMember' and 'frequency' are functions to
-- determine if a word is in the dictionary and to look up its
-- frequency, respectively.  Candidate tiers: the word itself if known,
-- then known one-edit words, then known two-edit words, then the word.
correct :: Dict -> Text -> Text
correct (isMember, frequency) w0 = maximumBy (comparing frequency) candidates
  where
    oneEdit    = edits w0
    twoEdit    = [ e2 | e1 <- oneEdit, e2 <- edits e1 ]
    candidates =
        filter isMember [w0]
          `orElse` (filter isMember oneEdit
          `orElse` (filter isMember twoEdit
          `orElse` [w0]))
用法是这样的:
{-# LANGUAGE OverloadedStrings #-}
import ... -- import the above code
-- | Demo entry point: build the dictionary functions from "big.txt"
-- via 'readDict', then print the correction for a misspelled word.
main = do
    dictfns <- readDict
    print $ correct dictfns "howwa"
我测量的校正时间与Python版本相当 - 可能快10%。
答案 1 :(得分:1)
nwords
消耗的时间最多。另一个函数correct
实际上非常快。在我的机器上,nwords
需要大约6.5秒,correct
需要不到0.1秒,这与每秒处理10个单词的速度相匹配,这正是Peter Norvig的目标。
我可以通过force
train
函数的输入和输出来提高20%的性能。
我用ghc -o Spl -O2 Spl.hs
编译了程序。
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import Control.DeepSeq (deepseq, force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
-- | Word-frequency table: maps each word to its occurrence count.
type NWords = Map.Map String Int
-- | The candidate letters used when generating one-character edits.
alphabet :: String
alphabet = ['a' .. 'z']
-- | Build the word-frequency model from raw training text.
nwords :: String -> Map.Map String Int
nwords text = train (words' text)
-- | Remove duplicates while keeping the first occurrence of each
-- element, carrying a 'Set' of already-seen values (O(n log n)).
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = go Set.empty
  where
    go _ [] = []
    go seen (y:ys)
      | Set.member y seen = go seen ys
      | otherwise         = y : go (Set.insert y seen) ys
-- | Lowercase the input and extract every maximal run of the letters
-- a-z, using the TDFA regex engine.
words' :: String -> [String]
words' s = getAllTextMatches (map toLower s =~ "[a-z]+")
-- Forcing both the input list and the result map keeps large thunk
-- chains from accumulating (the original author measured ~20% faster).
train :: [String] -> NWords
train ws =
    force (foldl' (\acc w -> Map.insertWith (+) w 1 acc) Map.empty (force ws))
-- | Every string at edit distance one from @word@ — deletions, adjacent
-- transpositions, single-letter replacements and insertions — with
-- duplicates removed in first-occurrence order.
edits :: String -> [String]
edits word = uniqueElems (deletions ++ transpositions ++ replacements ++ insertions)
  where
    pieces         = [splitAt i word | i <- [0 .. length word]]
    deletions      = [pre ++ drop 1 suf | (pre, suf) <- pieces, not (null suf)]
    transpositions = [pre ++ suf !! 1 : head suf : drop 2 suf
                     | (pre, suf) <- pieces, length suf > 1]
    replacements   = [pre ++ ch : drop 1 suf
                     | (pre, suf) <- pieces, ch <- alphabet, not (null suf)]
    insertions     = [pre ++ ch : suf | (pre, suf) <- pieces, ch <- alphabet]
-- | Dictionary words reachable in exactly two edit steps from @word@.
knownEdits :: NWords -> String -> [String]
knownEdits nw word =
    uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1, e2 `Map.member` nw ]
-- | Keep only the words present in the frequency map, deduplicated.
known :: NWords -> [String] -> [String]
known nw ws = uniqueElems [ w | w <- ws, Map.member w nw ]
-- | Pick the most frequent candidate: the word itself if known, else
-- known words one edit away, else two edits away, else the input word.
-- The final tier @[word]@ guarantees 'head' never sees an empty list.
correct :: NWords -> String -> String
correct nw word = fst (maximumBy (comparing snd) scored)
  where
    scored     = [ (w, Map.findWithDefault 0 w nw) | w <- candidates ]
    candidates = head (filter (not . null) tiers)
    tiers      = [ known nw [word]
                 , known nw (edits word)
                 , knownEdits nw word
                 , [word] ]
-- | Load the corpus, build the model (forced so its full cost is
-- measured in the nwords phase, not at first lookup), correct ten
-- sample words, and report per-phase timings.
main :: IO ()
main = do
    time0 <- getPOSIXTime
    t <- readFile "big.txt"
    time1 <- getPOSIXTime
    putStrLn $ ":: readFile: " ++ show (time1 - time0)
    let nw = nwords t
    evaluate $ force nw
    time2 <- getPOSIXTime
    putStrLn $ ":: nwords: " ++ show (time2 - time1)
    -- Same ten sample corrections, in the original order.
    mapM_ (putStrLn . correct nw)
        [ "speling", "miracl", "helllo", "rabit", "kitteen"
        , "breaks", "sometheeng", "clessical", "theater", "dishis" ]
    time3 <- getPOSIXTime
    putStrLn $ ":: correcting: " ++ show (time3 - time2)
    -- The original also bound a/b/c/total here but never used them;
    -- those dead bindings are removed.
    putStrLn $ ":: total: " ++ show (time3 - time0)
这是我的输出:
:: readFile: 0.000202s
:: nwords: 6.063617s
spelling
miracle
hello
habit
kitten
breaks
something
classical
theater
dishes
:: correcting: 0.749441s
:: total: 6.81326s
此外,如果您不使用正则表达式,nwords
函数会快两倍:
-- | Regex-free tokenizer: lowercased maximal runs of alphabetic
-- characters (per Data.Char 'isAlpha', so non-ASCII letters count too,
-- exactly like the original).  The original bound pairs '(c,d)' and
-- '(e,f)' whose first components were unused; this version drops them.
words' :: String -> [String]
words' = map (map toLower) . tokens
  where
    -- Collect maximal alphabetic runs, skipping separator characters.
    tokens s =
      case span isAlpha s of
        ([], [])   -> []
        ([], rest) -> tokens (dropWhile (not . isAlpha) rest)
        (w, rest)  -> w : tokens (dropWhile (not . isAlpha) rest)
有趣的是,如果你试图纠正未知单词,纠正它们需要更长的时间。
(我是一名Haskell初学者,我试图通过回答Stack Overflow上的问题以及编写一些小练习程序来学习这门语言。)
关于内存消耗:
在我的系统中,此版本最多只需要21MB。如果您使用正则表达式,或者在force
中使用train
,则似乎存在空间泄漏。如果省略两者,则表现良好。我认为这与自编words'
函数比正则表达式words'
函数更加懒惰这一事实有关。
module Main (nwords, correct, main)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Data.Time.Clock.POSIX
import Data.Char (isAlpha)
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
-- | Word-frequency table: maps each word to its occurrence count.
type NWords = Map.Map String Int
-- | The candidate letters used when generating one-character edits.
alphabet :: String
alphabet = ['a' .. 'z']
-- | Build the word-frequency model from raw training text.
nwords :: String -> Map.Map String Int
nwords text = train (words' text)
-- | Remove duplicates while keeping the first occurrence of each
-- element, carrying a 'Set' of already-seen values (O(n log n)).
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = go Set.empty
  where
    go _ [] = []
    go seen (y:ys)
      | Set.member y seen = go seen ys
      | otherwise         = y : go (Set.insert y seen) ys
-- | Regex-free tokenizer: lowercased maximal runs of alphabetic
-- characters (per Data.Char 'isAlpha', so non-ASCII letters count too,
-- exactly like the original).  The original bound pairs '(c,d)' and
-- '(e,f)' whose first components were unused; this version drops them.
words' :: String -> [String]
words' = map (map toLower) . tokens
  where
    -- Collect maximal alphabetic runs, skipping separator characters.
    tokens s =
      case span isAlpha s of
        ([], [])   -> []
        ([], rest) -> tokens (dropWhile (not . isAlpha) rest)
        (w, rest)  -> w : tokens (dropWhile (not . isAlpha) rest)
-- | Count occurrences: fold the token list into a strict frequency map.
train :: [String] -> NWords
train ws = foldl' (\acc w -> Map.insertWith (+) w 1 acc) Map.empty ws
-- | Every string at edit distance one from @word@ — deletions, adjacent
-- transpositions, single-letter replacements and insertions — with
-- duplicates removed in first-occurrence order.
edits :: String -> [String]
edits word = uniqueElems (deletions ++ transpositions ++ replacements ++ insertions)
  where
    pieces         = [splitAt i word | i <- [0 .. length word]]
    deletions      = [pre ++ drop 1 suf | (pre, suf) <- pieces, not (null suf)]
    transpositions = [pre ++ suf !! 1 : head suf : drop 2 suf
                     | (pre, suf) <- pieces, length suf > 1]
    replacements   = [pre ++ ch : drop 1 suf
                     | (pre, suf) <- pieces, ch <- alphabet, not (null suf)]
    insertions     = [pre ++ ch : suf | (pre, suf) <- pieces, ch <- alphabet]
-- | Dictionary words reachable in exactly two edit steps from @word@.
knownEdits :: NWords -> String -> [String]
knownEdits nw word =
    uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1, e2 `Map.member` nw ]
-- | Keep only the words present in the frequency map, deduplicated.
known :: NWords -> [String] -> [String]
known nw ws = uniqueElems [ w | w <- ws, Map.member w nw ]
-- | Pick the most frequent candidate: the word itself if known, else
-- known words one edit away, else two edits away, else the input word.
-- The final tier @[word]@ guarantees 'head' never sees an empty list.
correct :: NWords -> String -> String
correct nw word = fst (maximumBy (comparing snd) scored)
  where
    scored     = [ (w, Map.findWithDefault 0 w nw) | w <- candidates ]
    candidates = head (filter (not . null) tiers)
    tiers      = [ known nw [word]
                 , known nw (edits word)
                 , knownEdits nw word
                 , [word] ]
-- | Load the corpus, build the model (forced so its full cost is
-- measured in the nwords phase, not at first lookup), correct ten
-- sample words, and report per-phase timings.
main :: IO ()
main = do
    time0 <- getPOSIXTime
    t <- readFile "big.txt"
    time1 <- getPOSIXTime
    putStrLn $ ":: readFile: " ++ show (time1 - time0)
    let nw = nwords t
    evaluate $ force nw
    time2 <- getPOSIXTime
    putStrLn $ ":: nwords: " ++ show (time2 - time1)
    -- Same ten sample corrections, in the original order.
    mapM_ (putStrLn . correct nw)
        [ "speling", "miracl", "helllo", "rabit", "kitteen"
        , "breaks", "sometheeng", "clessical", "theater", "dishis" ]
    time3 <- getPOSIXTime
    putStrLn $ ":: correcting: " ++ show (time3 - time2)
    -- The original also bound a/b/c/total here but never used them;
    -- those dead bindings are removed.
    putStrLn $ ":: total: " ++ show (time3 - time0)