当尝试使用Aeson解析115MB JSON文件时,Haskell程序内存不足

时间:2017-11-28 13:12:05

标签: json parsing haskell out-of-memory aeson

我的Haskell程序在尝试解析115MB JSON文件时内存不足。我怀疑我做了一些你不应该在Haskell做的事情 - 在程序的早期阶段,由于我在String而不是{{1}进行操作,因此内存不足} s - 但我无法弄清楚是什么。

我已将我的程序浓缩为以下MWE:

ByteString

输入格式如下:

{-# LANGUAGE GeneralizedNewtypeDeriving, OverloadedStrings, FlexibleInstances #-}

----------------------------------------
-- Imports
----------------------------------------

import System.Environment
  ( getArgs )
import Control.Monad
  ( mzero
  , when
  )

import qualified Data.ByteString.Lazy.Char8 as BS
import Data.Aeson
import Data.Maybe
import Data.Scientific
  ( Scientific )

import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V


----------------------------------------
-- Data types
----------------------------------------

newtype Natural
  = Natural Integer
  deriving (Show, Eq, Ord)

instance Num Natural where
    fromInteger = toNatural
    x + y = toNatural (fromNatural x + fromNatural y)
    x - y = let r = fromNatural x - fromNatural y
            in if r < 0
               then error "Subtraction yielded a negative value"
               else toNatural r
    x * y = toNatural (fromNatural x * fromNatural y)
    abs x = x
    signum x = toNatural $ signum $ fromNatural x

instance Enum Natural where
  toEnum = toNatural . toInteger
  fromEnum = fromInteger . fromNatural

instance Real Natural where
  toRational (Natural i) = toRational i

instance Integral Natural where
  quotRem (Natural x) (Natural y) =
    ( toNatural $ quot x y
    , toNatural $ rem x y
    )
  toInteger (Natural i) = i

instance FromJSON Natural where
  parseJSON (Number sn) = return $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON Natural where
  toJSON i = toJSON (fromNatural i)

----------------------------------------

data PatternMatchset
  = PatternMatchset
      { pmTarget :: TargetMachineID
      , pmMatches :: [PatternMatch]
      , pmTime :: Maybe Double
      }
  deriving (Show)

instance FromJSON PatternMatchset where
  parseJSON (Object v) =
    PatternMatchset
      <$> v .: "target-machine-id"
      <*> v .: "match-data"
      <*> v .: "time"
  parseJSON _ = mzero

instance ToJSON PatternMatchset where
  toJSON m =
    object [ "target-machine-id" .= (pmTarget m)
           , "match-data"        .= (pmMatches m)
           , "time"              .= (pmTime m)
           ]

----------------------------------------

data PatternMatch
  = PatternMatch
      { pmInstrID :: InstructionID
      , pmMatchID :: MatchID
      , pmMatch :: Match NodeID
      }
  deriving (Show)

instance FromJSON PatternMatch where
  parseJSON (Object v) =
    PatternMatch
      <$> v .: "instr-id"
      <*> v .: "match-id"
      <*> v .: "match"
  parseJSON _ = mzero

instance ToJSON PatternMatch where
  toJSON m =
    object [ "instr-id"   .= (pmInstrID m)
           , "match-id"   .= (pmMatchID m)
           , "match"      .= (pmMatch m)
           ]

----------------------------------------

data Match n
  = Match { f2pMaps :: M.Map n [n]
          , p2fMaps :: M.Map n [n]
          }
  deriving (Show, Eq, Ord)

instance FromJSON (Match NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       return $ toMatch list
  parseJSON _ = mzero

instance ToJSON (Match NodeID) where
  toJSON m = toJSON $ fromMatch m

----------------------------------------

data Mapping n
  = Mapping
      { fNode :: n
      , pNode :: n
      }
  deriving (Show, Eq, Ord)

instance FromJSON (Mapping NodeID) where
  parseJSON v@(Array _) =
    do list <- parseJSON v
       when (length list /= 2) mzero
       return Mapping { fNode = head list
                      , pNode = last list
                      }
  parseJSON _ = mzero

instance ToJSON (Mapping NodeID) where
  toJSON m = Array (V.fromList [toJSON $ fNode m, toJSON $ pNode m])

----------------------------------------

newtype MatchID
  = MatchID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON MatchID where
  parseJSON (Number sn) = return $ toMatchID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON MatchID where
  toJSON mid = toJSON (fromMatchID mid)

----------------------------------------

newtype NodeID
  = NodeID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON NodeID where
  parseJSON (Number sn) = return $ toNodeID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON NodeID where
  toJSON mid = toJSON (fromNodeID mid)

----------------------------------------

newtype InstructionID
  = InstructionID Natural
  deriving (Show, Eq, Ord, Num, Enum, Real, Integral)

instance FromJSON InstructionID where
  parseJSON (Number sn) = return $ toInstructionID $ sn2nat sn
  parseJSON _ = mzero

instance ToJSON InstructionID where
  toJSON mid = toJSON (fromInstructionID mid)

----------------------------------------

newtype TargetMachineID
  = TargetMachineID String
  deriving (Show, Eq)

instance FromJSON TargetMachineID where
  parseJSON (String s) = return $ toTargetMachineID $ T.unpack s
  parseJSON _ = mzero

instance ToJSON TargetMachineID where
  toJSON tmid = toJSON (fromTargetMachineID tmid)


----------------------------------------
-- Help functions
----------------------------------------

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, 'Nothing' is
-- returned.
maybeToNatural :: (Integral i) => i -> Maybe Natural
maybeToNatural x
  | x < 0     = Nothing
  | otherwise = Just $ Natural $ toInteger x

-- | Converts an 'Integral' into a 'Natural'. If conversion fails, an error is
-- reported.
toNatural :: (Integral i) => i -> Natural
toNatural x =
  let n = maybeToNatural x
  in if isJust n
     then fromJust n
     else error $ "toNatural: negative number: " ++
                  show (toInteger x :: Integer)

-- | Converts a 'Natural' into an 'Integer'.
fromNatural :: Natural -> Integer
fromNatural (Natural i) = i

-- | Converts a scientific number to a natural number. If the number is not an
-- non-negative then an error occurs.
sn2nat :: Scientific -> Natural
sn2nat sn =
  let int_value = round sn
  in if fromInteger int_value /= sn
     then error $ "sn2nat: not an integer: " ++ show sn
     else toNatural int_value

fromTargetMachineID :: TargetMachineID -> String
fromTargetMachineID (TargetMachineID i) = i

toTargetMachineID :: String -> TargetMachineID
toTargetMachineID = TargetMachineID

fromMatchID :: MatchID -> Natural
fromMatchID (MatchID i) = i

toMatchID :: (Integral i) => i -> MatchID
toMatchID = MatchID . toNatural

fromNodeID :: NodeID -> Natural
fromNodeID (NodeID i) = i

toNodeID :: (Integral i) => i -> NodeID
toNodeID = NodeID . toNatural

fromInstructionID :: InstructionID -> Natural
fromInstructionID (InstructionID i) = i

toInstructionID :: (Integral i) => i -> InstructionID
toInstructionID = InstructionID . toNatural

toMatch :: Ord n => [Mapping n] -> Match n
toMatch ms =
  let insert (n1, n2) m = M.insertWith (++) n1 [n2] m
  in Match { f2pMaps = foldr insert M.empty $
                       map (\m -> (fNode m, pNode m)) ms
           , p2fMaps = foldr insert M.empty $
                       map (\m -> (pNode m, fNode m)) ms
           }

fromMatch :: Ord n => Match n -> [Mapping n]
fromMatch m =
  M.foldrWithKey
    (\fn pns ms -> (ms ++ map (\pn -> Mapping { fNode = fn, pNode = pn }) pns))
    []
    (f2pMaps m)


----------------------------------------
-- Main program
----------------------------------------

main :: IO ()
main =
  do args <- getArgs
     when (length args == 0) $
       error $ "No input file"
     when (length args > 1) $
       error $ "Too many arguments"
     let file = head args
     str <- BS.readFile file
     let pmset = decode str
     when (isNothing pmset) $
       error $ "Failed to parse JSON"
     putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)

上面的程序只是解析JSON文件,将其转换回JSON并打印数据。要获得更大的输入文件,只需将对象复制粘贴到{ "match-data": [ { "instr-id": 31, "match": [ [2354, 5], [2343, 3], [2341, 10], [2340, 9], [1478, 8], [1476, 6] ], "match-id": 0 } ], "target-machine-id": "Architecture", "time": 27.642428397 } 列表中并将其附加到列表中即可。

我尝试使用-O2标志编译程序,但无济于事。

1 个答案:

答案 0 :(得分:1)

尝试更改:

putStrLn $ BS.unpack $ encode (fromJust pmset :: PatternMatchset)

 BS.putStrLn $ encode (fromJust pmset :: PatternMatchset)

前者导致我的机器进入交换地狱。后者完成得很好。