下面是Haskell代码(HTTP)下载给定目录中缺少的文件:
module Main where
import Control.Monad ( filterM
, liftM
)
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
, rspBody
, simpleHTTP
)
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
, hPutStr
, hPutStrLn
, IOMode(WriteMode)
, openFile
, stderr
)
import Text.Printf ( printf )
indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
where
format1 index =
printf "%d-%d" ((index * 1000 + 1) :: Int)
(((index + 1) * 1000) :: Int)
format2 index =
printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
((10000 + (2 * index + 2) * 1000) :: Int)
main :: IO ()
main = do
[dir] <- getArgs
updateDownloads dir
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
let
fileNames = map (\index ->
(index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
missing <-
filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
mapM_ (\(index, fileName) -> do
let
url =
"http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
index
request =
Request
{ rqURI = fromJust $ parseURI url
, rqMethod = GET
, rqHeaders = []
, rqBody = ""
}
hPutStrLn stderr $ "Downloading " ++ show url
resp <- simpleHTTP request
case resp of
Left _ -> hPutStrLn stderr $ "Error connecting to " ++ show url
Right response -> do
let
html = rspBody response
file <- openFile fileName WriteMode
hPutStr file html
hClose file
return ()) missing
我想并行运行下载。我知道par
,但我不确定它是否可以在IO
monad中使用,如果可以,如何使用?
更新:以下是使用Control.Concurrent.Async
和mapConcurrently
重新实现的代码:
module Main where
import Control.Concurrent.Async ( mapConcurrently )
import Control.Monad ( filterM
, liftM
)
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
, rspBody
, simpleHTTP
)
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
, hPutStr
, hPutStrLn
, IOMode(WriteMode)
, openFile
, stderr
)
import Text.Printf ( printf )
indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
where
format1 index =
printf "%d-%d" ((index * 1000 + 1) :: Int)
(((index + 1) * 1000) :: Int)
format2 index =
printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
((10000 + (2 * index + 2) * 1000) :: Int)
main :: IO ()
main = do
[dir] <- getArgs
updateDownloads dir
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
let
fileNames = map (\index ->
(index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
missing <-
filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
pages <-
mapConcurrently (\(index, fileName) -> getUrl index fileName) missing
mapM_ (\(fileName, html) -> do
handle <- openFile fileName WriteMode
hPutStr handle html
hClose handle) pages
where
getUrl :: String -> FilePath -> IO (FilePath, String)
getUrl index fileName = do
let
url =
"http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
index
request =
Request
{ rqURI = fromJust $ parseURI url
, rqMethod = GET
, rqHeaders = []
, rqBody = ""
}
resp <- simpleHTTP request
case resp of
Left _ -> do
hPutStrLn stderr $ "Error connecting to " ++ show url
return ("", "")
Right response ->
return (fileName, rspBody response)
答案 0 :(得分:13)
这看起来正是async
的设计目标,实际上这个例子是并行下载的。还有一个关于此的演示文稿 - http://skillsmatter.com/podcast/home/high-performance-concurrency - 非常值得一试。
答案 1 :(得分:12)
由于操作涉及IO,因此通常会/不使用par
,因为它对IO操作没有任何作用。
您需要一个显式并发模型,以隐藏下载的延迟。
我推荐MVars或TVars,结合forkIO。
工作队列抽象通常对此类问题很有用:将所有URL推入队列,并为N个核心提供一组固定的工作线程(例如N * k),直到完成工作。然后,完成的工作将附加到交回主线程的通信渠道。
以下是parallel URL checker使用频道的示例。
答案 2 :(得分:8)
从Simon Marlow的“异步”库中查看mapConcurrently
。
它将IO
动作并行和异步映射到Traversable
容器的元素,并等待所有操作。
示例:
{-# LANGUAGE PackageImports #-}
import System.Environment (getArgs)
import "async" Control.Concurrent.Async (mapConcurrently)
import "HTTP" Network.HTTP
import "HTTP" Network.Stream (Result)
import "HTTP" Network.HTTP.Base (Response(..))
import System.IO
import "url" Network.URL (encString)
import Control.Monad
getURL :: String -> IO (String, Result (Response String))
getURL url = do
res <- (simpleHTTP . getRequest) url
return (url, res)
main = do
args <- getArgs
case args of
[] -> putStrLn "usage: program url1 url2 ... urlN"
args -> do
results <- mapConcurrently getURL args
forM_ results $ \(url, res) -> do
case res of
Left connError -> putStrLn $ url ++ "; " ++ show connError
Right response -> do
putStrLn $ url ++ "; OK"
let content = rspBody response
-- make name from url
fname = encString True (`notElem` ":/") url ++ ".html"
writeFile fname content
答案 3 :(得分:2)
另一个使用async's mapConcurrently和http-conduit keep-alive manager
的版本{-# LANGUAGE PackageImports, FlexibleContexts #-}
import System.Environment (getArgs)
import "http-conduit" Network.HTTP.Conduit
import qualified "conduit" Data.Conduit as C
import "http-types" Network.HTTP.Types.Status (ok200)
import "async" Control.Concurrent.Async (mapConcurrently)
import qualified "bytestring" Data.ByteString.Lazy as LBS
import qualified "bytestring" Data.ByteString as BS
import "transformers" Control.Monad.Trans.Class (lift)
import "transformers" Control.Monad.IO.Class (liftIO)
import "url" Network.URL (encString)
import "failure" Control.Failure (Failure(..))
import Control.Monad
import System.IO
taggedRequest :: Failure HttpException m => String -> m (String, Request m')
taggedRequest url = do
req <- parseUrl url
return (url, req)
taggedResult :: (C.MonadBaseControl IO m, C.MonadResource m) => Manager -> (String, Request m) -> m (String, Response LBS.ByteString)
taggedResult manager (url, req) = do
res <- httpLbs req manager
return (url, res)
main = do
args <- getArgs
case args of
[] -> putStrLn "usage: program url1 url2 ... urlN"
args -> do
requests <- mapM taggedRequest args
withManager $ \manager -> liftIO $ do
results <- mapConcurrently (C.runResourceT . taggedResult manager) requests
forM_ results $ \(url, Response status _ _ bsBody) -> do
putStrLn $ url ++ " ; " ++ show status
let fileName = encString True (`notElem` ":/") url ++ ".html"
when (status == ok200) $ LBS.writeFile fileName bsBody