我正在尝试将大量数据写入常量内存中的文件。
import qualified Data.ByteString.Lazy as B
{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
rng <- newPureMT
let (grids,shuffleds) = createGrids rng aa
createDirectoryIfMissing True "data/grids/"
B.writeFile (gridFileName num aa)
(encode (take num grids))
B.writeFile (shuffledFileName num aa)
(encode (take num shuffleds))
然而,这会消耗与num
大小成比例的内存。我知道createGrids
是一个足够懒惰的函数,因为我通过将error "not lazy enough"
(由Haskell wiki here建议)附加到它返回的列表的末尾并且没有引发错误来测试它。 take
是一个惰性函数,在Data.List
中定义。 encode
也是Data.Binary
中定义的惰性函数。 B.writeFile
中定义了Data.ByteString.Lazy
。
以下是完整的代码,您可以执行它:
import Control.Arrow (first)
import Data.Binary
import GHC.Float (double2Float)
import System.Random (next)
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble)
import System.Random.Shuffle (shuffle')
import qualified Data.ByteString.Lazy as B
main :: IO ()
main = writeGrids 1000 64
{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
rng <- newPureMT
let (grids,shuffleds) = createGrids rng aa
B.writeFile "grids.bin" (encode (take num grids))
B.writeFile "shuffleds.bin" (encode (take num shuffleds))
{- a random number generator, dimension of grids to make
returns a pair of lists, the first is a list of grids of dimensions
aa x aa, the second is a list of the shuffled grids corresponding to the first list -}
createGrids :: PureMT -> Int -> ([[(Float,Float)]],[[(Float,Float)]])
createGrids rng aa = (grids,shuffleds) where
rs = randomFloats rng
grids = map (getGridR aa) (chunksOf (2 * aa * aa) rs)
shuffleds = shuffler (aa * aa) rng grids
{- length of each grid, a random number generator, a list of grids
returns a the list with each grid shuffled -}
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [[(Float,Float)]]
shuffler n rng (xs:xss) = shuffle' xs n rng : shuffler n (snd (next rng)) xss
shuffler _ _ [] = []
{- divides list into chunks of size n -}
chunksOf :: Int -> [a] -> [[a]]
chunksOf n = go
where go xs = case splitAt n xs of
(ys,zs) | null ys -> []
| otherwise -> ys : go zs
{- dimension of grid, list of random floats [0,1]
returns a list of (x,y) points of length n^2 such that all
points are in the range [0,1] and the points are a randomly
perturbed regular grid -}
getGridR :: Int -> [Float] -> [(Float,Float)]
getGridR n rs = pts where
nn = n * n
(irs,jrs) = splitAt nn rs
n' = fromIntegral n
grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ]
pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs)
{- an infinite list of random floats in range [0,1] -}
randomFloats :: PureMT -> [Float]
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng)
in d : randomFloats rng'
所需的套餐包括: ,bytestring ,二进制 ,随机 ,mersenne-random-pure64 ,random-shuffle
答案 0 :(得分:10)
内存使用的两个原因:
首先,Data.Binary.encode
似乎无法在恒定的空间中运行。以下程序使用910 MB内存:
import Data.Binary
import qualified Data.ByteString.Lazy as B
len = 10000000 :: Int
main = B.writeFile "grids.bin" $ encode [0..len]
如果我们从0
离开len
,我们将获得97 MB的内存使用量。
相反,以下程序使用1 MB:
import qualified Data.ByteString.Lazy.Char8 as B
main = B.writeFile "grids.bin" $ B.pack $ show [0..(1000000::Int)]
第二,在您的程序shuffleds
中包含对grids
内容的引用,这会阻止grids
的垃圾回收。因此,当我们打印grids
时,我们也会对其进行评估,然后必须在内存中进行打印,直到我们完成打印shuffleds
。您的程序的以下版本仍然消耗大量内存,但如果我们使用B.writeFile
注释掉这两行中的一行,则它使用常量空间。
import qualified Data.ByteString.Lazy.Char8 as B
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
rng <- newPureMT
let (grids,shuffleds) = createGrids rng aa
B.writeFile "grids.bin" (B.pack $ show (take num grids))
B.writeFile "shuffleds.bin" (B.pack $ show (take num shuffleds))
答案 1 :(得分:7)
对于它的价值,这里是一个完整的解决方案,结合了每个人的想法。内存消耗量恒定在~6MB(使用-O2
编译)。
import Control.Arrow (first)
import Control.Monad.State (state, evalState)
import Data.Binary
import GHC.Float (double2Float)
import System.Random (next)
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble)
import System.Random.Shuffle (shuffle')
import qualified Data.ByteString as B (hPut)
import qualified Pipes.Binary as P (encode)
import qualified Pipes.Prelude as P (zip, mapM, drain)
import Pipes (runEffect, (>->))
import System.IO (withFile, IOMode(AppendMode))
main :: IO ()
main = writeGrids 1000 64
{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
rng <- newPureMT
let (grids, shuffleds) = createGrids rng aa
gridFile = "grids.bin"
shuffledFile = "shuffleds.bin"
encoder = P.encode . SerList . take num
writeFile gridFile ""
writeFile shuffledFile ""
withFile gridFile AppendMode $ \hGr ->
withFile shuffledFile AppendMode $ \hSh ->
runEffect
$ P.zip (encoder grids) (encoder shuffleds)
>-> P.mapM (\(ch1, ch2) -> B.hPut hGr ch1 >> B.hPut hSh ch2)
>-> P.drain -- discards the stream of () results.
{- a random number generator, dimension of grids to make
returns a pair of lists, the first is a list of grids of dimensions
aa x aa, the second is a list of the shuffled grids corresponding to the first list -}
createGrids :: PureMT -> Int -> ( [[(Float,Float)]], [[(Float,Float)]] )
createGrids rng aa = unzip gridsAndShuffleds where
rs = randomFloats rng
grids = map (getGridR aa) (chunksOf (2 * aa * aa) rs)
gridsAndShuffleds = shuffler (aa * aa) rng grids
{- length of each grid, a random number generator, a list of grids
returns a the list with each grid shuffled -}
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [( [(Float,Float)], [(Float,Float)] )]
shuffler n rng xss = evalState (traverse oneShuffle xss) rng
where
oneShuffle xs = state $ \r -> ((xs, shuffle' xs n r), snd (next r))
newtype SerList a = SerList { runSerList :: [a] }
deriving (Show)
instance Binary a => Binary (SerList a) where
put (SerList (x:xs)) = put False >> put x >> put (SerList xs)
put _ = put True
get = do
stop <- get :: Get Bool
if stop
then return (SerList [])
else do
x <- get
SerList xs <- get
return (SerList (x : xs))
{- divides list into chunks of size n -}
chunksOf :: Int -> [a] -> [[a]]
chunksOf n = go
where go xs = case splitAt n xs of
(ys,zs) | null ys -> []
| otherwise -> ys : go zs
{- dimension of grid, list of random floats [0,1]
returns a list of (x,y) points of length n^2 such that all
points are in the range [0,1] and the points are a randomly
perturbed regular grid -}
getGridR :: Int -> [Float] -> [(Float,Float)]
getGridR n rs = pts where
nn = n * n
(irs,jrs) = splitAt nn rs
n' = fromIntegral n
grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ]
pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs)
{- an infinite list of random floats in range [0,1] -}
randomFloats :: PureMT -> [Float]
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng)
in d : randomFloats rng'
对变化的评论:
shuffler
现在是State
仿函数的遍历。它通过输入列表在单个传递中生成一对对列表,其中每个网格与其混洗版本配对。 createGrids
然后(懒惰地)解压缩此列表。
文件是使用pipes
机制编写的,其方式受this answer的启发(我最初使用P.foldM
编写)。请注意,我使用的hPut
是严格的字节串,因为它作用于由P.zip
生成的生成器提供的严格块(在本质上,它是一对延迟的字节串,成对提供块) )。
SerList
可以保留Thomas M. DuBuisson提到的自定义Binary
实例。请注意,我没有过多考虑实例的get
方法中的懒惰和严格性。如果这会给您带来麻烦,this question看起来很有用。