解析html的简单示例:
import qualified Data.Text as T
import Text.HTML.DOM (parseLBS)
import Text.XML.Cursor (Cursor, attributeIs, content, element, fromDocument, child, ($//), (&|), (&//), (>=>), following)
import qualified Data.String (fromString)
-- | Axis that keeps a cursor only when it points at a @div@ element whose
-- @id@ attribute equals @large-user-info@.
findNodes :: Cursor -> [Cursor]
findNodes = element divName >=> attributeIs idName wantedId
  where
    -- Literals are converted explicitly because this snippet does not enable
    -- OverloadedStrings.
    divName = Data.String.fromString "div"
    idName = Data.String.fromString "id"
    wantedId = Data.String.fromString "large-user-info"
-- Disabled continuation of the selector chain, kept as written:
-- >=>
-- following >=> element (toXName "div") >=> (toXName "class") `attributeIs` (toT "reputation") >=>
-- child >=> child
-- | Download the profile page and wrap the parsed document in a 'Cursor'.
-- NOTE(review): 'simpleHttp' is not in this snippet's import list; the
-- answer's snippet imports it from "Network.HTTP.Conduit".
cursorFor :: IO Cursor
cursorFor =
  fmap
    (fromDocument . parseLBS)
    (simpleHttp "http://stackoverflow.com/users/2813589/alexander-supertramp")
-- | Fetch the page and print whatever 'findNodes' returns for the root cursor.
-- 'findNodes' is applied to the root cursor itself, not to its descendants.
main :: IO ()
main = cursorFor >>= print . findNodes
尽管 `page` 返回了整个页面(我已经检查过),`findNodes` 仍然返回一个空列表——它始终打印 `[]`。我做错了什么?
答案 0 :(得分:2)
我已经重写了你的部分代码。你之前什么都得不到,是因为系统不知道该如何把数据预览给你:这个 div 里有很多内部元素。注意,下面的代码用 `cursor $// findNodes` 在所有后代节点上查找,而不是像原来那样只把 `findNodes` 应用于根游标。
{-# LANGUAGE OverloadedStrings #-}
module ALSU where
import Network.HTTP.Conduit (simpleHttp)
import Text.HTML.DOM
import Text.XML.Cursor (Cursor, attributeIs, attribute, node, content, element, fromDocument, fromNode, child,
($//), (&|), (&//), (>=>))
import qualified Data.Text as T
import qualified Data.String (fromString)
----------------------------------------------------------------------------
-- | Address of the profile page that 'main' downloads.
-- The signature is explicit because OverloadedStrings makes a bare string
-- literal polymorphic; 'cursorFor' expects a plain 'String'.
url :: String
url = "http://stackoverflow.com/users/2813589/alexander-supertramp"
-- | Keep a cursor only if it is a @div@ element with @id="large-user-info"@.
findNodes :: Cursor -> [Cursor]
findNodes cursor = element "div" cursor >>= attributeIs "id" "large-user-info"
-- | Join the text pieces that 'content' reports for a single cursor into one
-- 'T.Text' value.
-- NOTE(review): in the sample output shown with this answer the matched div
-- comes out as @""@ — adjust this function to pull out the data you want.
extractData :: Cursor -> T.Text
extractData = T.concat . content
-- | Download the page at the given URL, parse it, and return a 'Cursor' built
-- from the resulting document.
cursorFor :: String -> IO Cursor
cursorFor u = fmap (fromDocument . parseLBS) (simpleHttp u)
-- | Fetch the profile page, then print the matching cursors followed by the
-- text extracted from each of them, and finish with a newline.
main :: IO ()
main = do
  root <- cursorFor url
  let matches = root $// findNodes
      texts = root $// findNodes &| extractData
  putStr (show matches)
  putStr (show texts)
  putStr "\n"
这里使用了 `OverloadedStrings` 编译指示,因此您无需再写 `Data.String.fromString`。运行这段代码,您会看到确实获取到了数据;您需要根据想要预览的内容,按自己的需要重写 `extractData`。
这是输出
[Cursor @ NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header"),(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"large-user-info"),(Name {nameLocalName = "style", nameNamespace = Nothing, namePrefix = Nothing},"")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header-left")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"gravatar")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"http://stackoverflow.com/users/2813589/alexander-supertramp")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "img", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "alt", nameNamespace = Nothing, namePrefix = Nothing},""),(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"logo"),(Name {nameLocalName = "height", nameNamespace = Nothing, namePrefix = Nothing},"128"),(Name {nameLocalName = "src", nameNamespace = Nothing, namePrefix = 
Nothing},"https://www.gravatar.com/avatar/0e1f310400630c00abfe892c212bfe18?s=128&d=identicon&r=PG&f=1"),(Name {nameLocalName = "width", nameNamespace = Nothing, namePrefix = Nothing},"128")], elementNodes = []})]}),NodeContent "\r\n "]}),NodeContent " \r\n \r\n\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"reputation")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"/users/2813589/alexander-supertramp?tab=reputation")], elementNodes = [NodeContent "1,780"]}),NodeContent "\r\n "]}),NodeContent "\r\n reputation\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badges")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"1 gold badge")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace 
= Nothing, namePrefix = Nothing},"badge1")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "1"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"5 silver badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge2")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "5"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"22 bronze badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge3")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "22"]})]})]}),NodeContent " \r\n "]}),NodeContent "\r\n \r\n ",NodeElement 
(Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"change-picture-progress")], elementNodes = []}),NodeContent "\r\n "]}),NodeContent "\r\n\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"data")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "table", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "bio"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "website"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"url"),(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = 
Nothing},"http://www.gildedhonour.com"),(Name {nameLocalName = "rel", nameNamespace = Nothing, namePrefix = Nothing},"nofollow me")], elementNodes = [NodeContent "gildedhonour.com"]})]}),NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "location"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"label adr")], elementNodes = [NodeContent "roaming in SE Asia"]}),NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "age"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n "]}),NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", 
nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "visits"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "member for"]}),NodeContent "\r\n\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"cool"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2013-09-25 04:05:54Z")], elementNodes = [NodeContent "9 months"]}),NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "seen"]}),NodeContent "\r\n\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"hot"),(Name {nameLocalName = "title", nameNamespace = Nothing, 
namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"relativetime"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "32 mins ago"]}),NodeContent "\r\n "]}),NodeContent "\r\n "]}),NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-profile-stats")], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "stats"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "profile views"]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "465"]}),NodeContent "\r\n "]}),NodeContent "\r\n\r\n "]}),NodeContent "\r\n "]}),NodeContent "\r\n "]}),NodeContent "\r\n\r\n "]}),NodeContent "\r\n \r\n ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = 
Nothing},"user-about-me note")], elementNodes = [NodeContent "\r\n "]}),NodeContent "\r\n ",NodeElement (Element {elementName = Name {nameLocalName = "br", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"clear")], elementNodes = []}),NodeContent "\r\n "]})][""]