Haskell:从文件中获取字符串列表

时间:2017-05-30 18:07:01

标签: haskell

我的意见是:

Plaster ["BD..", ".GA.D", ".FEG", "ABDCF", "E..."]

我想要得到的东西:

["BD..", ".GA.D", ".FEG", "ABDCF", "E..."]

我现在的代码:

go = do --print "Enter file name"
        --path <- getLine
        file <- (readFile "1.txt")
        print file
        let list = consume file 
        print list
        let content = (wordsWhen (=='"') list) 
        print content
        print (content !! 0)
        print (content !! 1)
        print (content !! 2)


wordsWhen :: (Char -> Bool) -> String -> [String]
wordsWhen p s =  case dropWhile p s of
                      "" -> []
                      ", " -> []
                      s' -> w : wordsWhen p s''
                            where (w, s'') = break p s'

consume []       = []
consume ('[':xs) = consume' xs
consume (_  :xs) = consume xs

consume' []       = [] 
consume' (']':xs) = []
consume' (x  :xs) = x : consume' xs

所以我正在做的是

  • 从目的地读取文件(现在硬编码用于测试)
  • 摆脱&#34;石膏&#34; consume
  • 使用wordsWhen
  • 从文件中获取所有字符串

我为wordsWhen尝试了不同的分隔符,但我无法获得所需内容。在当前形式中,输出为:

"Plaster [\"BD..\", \".GA.D\", \".FEG\", \"ABDCF\", \"E...\"]"
"\"BD..\", \".GA.D\", \".FEG\", \"ABDCF\", \"E...\""
["BD..",", ",".GA.D",", ",".FEG",", ","ABDCF",", ","E..."]
"BD.."
", "
".GA.D"

这是非常准确的,但我想摆脱只包含逗号的单词。我可以将分隔符更改为逗号(我认为它应该是这样),但是然后输出带有所有这些斜杠和引号,如下所示:

"Plaster [\"BD..\", \".GA.D\", \".FEG\", \"ABDCF\", \"E...\"]"
"\"BD..\", \".GA.D\", \".FEG\", \"ABDCF\", \"E...\""
["\"BD..\""," \".GA.D\""," \".FEG\""," \"ABDCF\""," \"E...\""]
"\"BD..\""
" \".GA.D\""
" \".FEG\""

有没有办法修复我的代码?或者我应该以不同的方式做到这一点?

编辑:由于这是我的练习,我只能使用标准类型和功能。

4 个答案:

答案 0 :(得分:4)

好吧,你可以通过定义一个与你现有输入相匹配的Read实例的数据类型来作弊:

{-# OPTIONS_GHC -Wall -Werror -Wno-name-shadowing #-}
module Main where

data Input = Plaster [String] deriving (Read, Show)

main :: IO ()
main = do
  Plaster xs <- readIO =<< readFile "1.txt"
  _ <- traverse print (zip [0 :: Int ..] xs)
  return ()

这对我来说非常适合ghc-8.0.2:

$ cat "1.txt"
Plaster ["BD..", ".GA.D", ".FEG", "ABDCF", "E..."]
$ ghc --make SO44269043.hs && ./SO44269043
(0,"BD..")
(1,".GA.D")
(2,".FEG")
(3,"ABDCF")
(4,"E...")

或者,您可以定义自己的读取实例:

data Input = Plaster [String]

instance Read Input where
  readsPrec p = readParen (p >= 10) . runR $ do
    Plaster <$> (string "Plaster" *> many1 whitespace *> R readList)

如果您不熟悉<$>*>,可能会更容易理解为

  readsPrec p = readParen (p >= 10) . runR $ do
    _ <- string "Plaster"
    _ <- many1 whitespace
    xs <- R readList
    return (Plaster xs)

即使没有导入,定义解析器类型R也不是很多代码,基本上只是一个包装器,可以让你为String -> [(a, String)]定义一个monad实例:

newtype R a = R { runR :: ReadS a }

instance Functor R where
  fmap f = R . fmap (map (\(a, s) -> (f a, s))) . runR

instance Applicative R where
  pure a = R $ \s -> [(a, s)]
  mf <*> ma = R $ \s -> do
    (f, s) <- runR mf s
    (a, s) <- runR ma s
    return (f a, s)

instance Monad R where
  m >>= f = R $ \s -> do
    (a, s) <- runR m s
    runR (f a) s

>>=(或绑定)运算符只是意味着“解析一些字符串 使用左侧的解析器,然后解析其余的字符串 将结果值传递给右侧的函数后。“

我们现在免费获得R readList :: R [String],所以我们需要做的就是 跳过最初的“石膏”和它与之间的任何空格 字符串列表:

string :: String -> R String
string = traverse char

many1 :: R a -> R [a]
many1 r = loop where
  loop = (:) <$> r <*> (loop <|> return [])

whitespace :: R Char
whitespace = char ' ' <|> char '\t' <|> char '\n' <|> char '\r'

通常我们会使用<|>中的GHC.Base,但在此处定义一次性并不难。基本上 r <|> r'表示“尝试使用r进行解析,如果失败,请尝试使用r'进行解析”

(<|>) :: R a -> R a -> R a
r <|> r' = R $ \s -> runR r s ++ runR r' s

现在我们所需要的只是匹配单个角色的能力:

char :: Char -> R Char
char c = R $ \s -> case s of
  (c' : s) | c == c'  -> [(c, s)]
  _                   -> []

如果即使使用Prelude.readList也太容易了,我们可以为列表和带引号的字符串定义我们自己的解析器:

  readsPrec p = readParen (p >= 10) . runR $ do
    Plaster <$> (string "Plaster" *> many1 whitespace *> listOf quotedString)

其中列表只有一个前导'[',一个尾随']'和一些分隔的术语:

listOf :: R a -> R [a]
listOf term = char '[' *> (term `sepBy` string ", ") <* char ']'

sepBy :: R a -> R b -> R [a]
sepBy term delim = sepBy1 term delim <|> return []

sepBy1 :: R a -> R b -> R [a]
sepBy1 term delim = loop where
  loop = (:) <$> term <*> ((delim *> loop) <|> return [])

同样,带引号的字符串只有一个前导'“',一个尾随'”'和一些转义字符:

quotedString :: R String
quotedString = char '"' *> many escapedChar <* char '"'

many :: R a -> R [a]
many r = many1 r <|> return []

escapedChar :: R Char
escapedChar = R $ \s -> case s of
  '\\' : '\\' : s               -> [('\\', s)]
  '\\' : '"' : s                -> [('"', s)]
  c : s | c /= '\\' && c /= '"' -> [(c, s)]
  _                             -> []

值得注意的是many/many1sepBy/sepBy1之间的相似性 - 如果我们真的那么 懒惰,我们可以用另一个来定义一个:

many1 r = r `sepBy1` return ()
term `sepBy1` delim = (:) <$> term <*> many (delim *> term)

答案 1 :(得分:2)

这是如何做到的:

getArgs

要获取第一个命令行参数,请使用1

答案 2 :(得分:1)

这是一个快速而又脏的解析器。 要小心,它只适用于格式良好的输入,不具备性能且代码不是分解的。但是没有作弊;) 也许它可以给你一些灵感来解决你的运动。

plaster :: String -> String
plaster ('P':'l':'a':'s':'t':'e':'r':' ':xs) = xs
plaster s = undefined


brackets :: String -> String
brackets ('[':xs) = brackets xs
brackets (x:']':_) = [x]
brackets (x:xs) = x:brackets xs


quotes :: String -> String
quotes ('"':xs) = quotes xs
quotes (x:'"':_) = [x]
quotes (x:xs) = x:quotes xs


sepByComma :: String -> [String]
sepByComma s = go s ""
  where
    go [] acc = [acc] 
    go (',':' ':xs) acc  = [acc] ++ go xs ""
    go (x:xs) acc = go xs (acc ++ [x])


parse :: String -> [String]
parse s = map quotes . sepByComma . brackets . plaster $ s

答案 3 :(得分:0)

以下是仅使用基础知识的替代方案,不包括MonadsFunctorsApplicative运算符。

main :: IO()
main = do
  input <- getLine
  let output = parse input
  print output

parse :: String -> [String]
parse = map stripQuotes . parse' . tokenize []
  where
    parse' :: [String] -> [String]
    -- If the input matches the pattern, call parseList on the inner tokens.
    -- Does not nest brackets!  This is a simple regex match.
    parse' ("Plaster":"[":tokens) | last tokens == "]" =
      parseList [] (removeLast tokens)
    parse' _ = error "The input does not have the form \"Plaster [...]\"."

parseList :: [String] -> [String] -> [String]
-- Empty list.
parseList tokens [] = tokens
-- Unexpected tokens.
parseList _ (",":_) = error "Unexpected comma."
parseList _ ("[":_) = error "No support for nested brackets."
parseList _ ("]":_) = error "Unexpected input after \"]\"."
-- One-element list.
parseList tokens [x] = tokens ++ [x]
-- Comma-separated list with at least two elements.
parseList tokens (x:",":y:ys) = parseList (tokens ++ [x]) (y:ys)
-- Comma at end of list, so we don’t want to give the "expected comma" error!
parseList _ [_,","] = error "Extra comma at end of list."
-- More than one element not separated by commas.
parseList _ (x:_) = error $ "Expected comma after \"" ++ x ++ "\"."

stripQuotes :: String -> String
stripQuotes ('"':xs) | last xs == '"' = removeLast xs
stripQuotes xs = error $ "Expected string literal instead of " ++ xs ++ "."

removeLast :: [a] -> [a]
removeLast xs = take ((length xs) - 1) xs

whitespace :: [Char]
whitespace = [' ', '\n', '\t'] -- Incomplete, but sufficient.

isWhitespace :: Char -> Bool
isWhitespace c = elem c whitespace

tokenize :: [String] -> String -> [String]
-- If we’ve consumed all the input, we’re done.
tokenize tokens [] = tokens
-- We’d need something a little more complicated for longer operators:
tokenize tokens ('[':xs) = tokenize (tokens ++ ["["]) xs
tokenize tokens (']':xs) = tokenize (tokens ++ ["]"]) xs
tokenize tokens (',':xs) = tokenize (tokens ++ [","]) xs
-- Not currently processing a token, so skip whitespace.
-- Otherwise, start a new token.
tokenize tokens (x:xs) | isWhitespace x = tokenize tokens xs
                       | otherwise      = tokenize' tokens [x] xs
  where
    tokenize' :: [String] -> String -> String -> [String]
-- If we’ve consumed all the input, the current token is the last.
    tokenize' ts t [] = ts ++ [t]
-- If we encounter an operator, it is the token after the current one.
    tokenize' ts t ('[':ys) = tokenize (ts ++ [t] ++ ["["]) ys
    tokenize' ts t (']':ys) = tokenize (ts ++ [t] ++ ["]"]) ys
    tokenize' ts t (',':ys) = tokenize (ts ++ [t] ++ [","]) ys
-- Whitespace means the current token is complete.
-- Otherwise, append y to the current token and continue.
    tokenize' ts t (y:ys) | isWhitespace y = tokenize (ts ++ [t]) ys
                          | otherwise      = tokenize' ts (t ++ [y]) ys

你不会在生产代码中这样做;这对于正则表达式来说很简单,并且解析(或多或少)是一个已解决的问题。 Parser组合器是时尚的方式。