无法通过 id 在 HTML 页面中找到元素(一个简单示例)

时间:2014-07-15 12:58:07

标签: haskell

解析html的简单示例:

import qualified Data.String (fromString)
import qualified Data.Text as T
import Network.HTTP.Conduit (simpleHttp)
import Text.HTML.DOM (parseLBS)
import Text.XML.Cursor (Cursor, attributeIs, content, element, fromDocument, child, ($//), (&|), (&//), (>=>), following)

-- | Axis that keeps the cursor it is given only when that cursor sits on a
-- @div@ element whose @id@ attribute equals @large-user-info@; any other
-- cursor yields the empty list.
findNodes :: Cursor -> [Cursor]
findNodes = element divName >=> attributeIs idAttr wantedId
  -- Unfinished continuation of the pipeline, left disabled:
  -- >=> 
  -- following >=> element (toXName "div") >=> (toXName "class") `attributeIs` (toT "reputation") >=>
  -- child >=> child
  where
    divName  = Data.String.fromString "div"
    idAttr   = Data.String.fromString "id"
    wantedId = Data.String.fromString "large-user-info"

-- | Download the Stack Overflow profile page and return a cursor positioned
-- on the root of the parsed HTML document.
--
-- 'simpleHttp' comes from "Network.HTTP.Conduit" and must be imported for
-- this snippet to compile.
cursorFor :: IO Cursor
cursorFor =
  -- Parse the response body leniently as HTML, then wrap the document root.
  fromDocument . parseLBS
    <$> simpleHttp "http://stackoverflow.com/users/2813589/alexander-supertramp"

-- | Fetch the page and print whatever 'findNodes' returns for its cursor.
--
-- NOTE(review): 'findNodes' is applied directly to the cursor produced by
-- 'cursorFor' (the document root); no child or descendant axis is involved.
main :: IO ()
main = cursorFor >>= print . findNodes

尽管 page 确实返回了整个页面(我已经检查过),findNodes 仍然返回空列表——程序始终打印 []。我做错了什么?

1 个答案:

答案 0 :(得分:2)

我稍微重写了一下代码。你之前得不到任何结果,是因为系统不知道如何为你预览这些数据:div 中有很多内部元素。另外请注意,下面的代码通过 cursor $// findNodes 把 findNodes 应用到所有后代节点上,而原来的 findNodes cursor 只检查文档的根节点。

{-# LANGUAGE OverloadedStrings #-}

module ALSU where

import Network.HTTP.Conduit (simpleHttp)
import Text.HTML.DOM 
import Text.XML.Cursor (Cursor, attributeIs, attribute, node,  content, element, fromDocument, fromNode, child,
                        ($//), (&|), (&//), (>=>))
import qualified Data.Text as T
import qualified Data.String (fromString)

----------------------------------------------------------------------------

-- | Address of the Stack Overflow profile page that 'main' downloads.
--
-- The explicit signature pins the literal to 'String'; with
-- @OverloadedStrings@ enabled the type would otherwise be decided only by
-- the binding's use site.
url :: String
url = "http://stackoverflow.com/users/2813589/alexander-supertramp"

-- | Keep the given cursor only when it sits on a @div@ element whose @id@
-- attribute equals @large-user-info@; otherwise return no cursors.
--
-- The string literals are converted via @OverloadedStrings@.
findNodes :: Cursor -> [Cursor]
findNodes cur = element "div" cur >>= attributeIs "id" "large-user-info"

-- | Extract the data from a single node: join every text chunk that
-- 'content' yields for the cursor into one 'T.Text'.
--
-- NOTE(review): the sample run shows @""@ for the matched @div@, so text
-- nested inside child elements is not collected here; adapt this function
-- to whatever should be previewed.
extractData :: Cursor -> T.Text
extractData = T.concat . content

-- | Download the page at address @u@ and return a cursor positioned on the
-- root of the parsed HTML document.
cursorFor :: String -> IO Cursor
cursorFor u = fromDocument . parseLBS <$> simpleHttp u

-- | Fetch the profile page, search all descendants of the root for the
-- target @div@, and print the matching cursors immediately followed (with no
-- separator) by the text extracted from each of them, ending with a newline.
main :: IO ()
main = do
  root <- cursorFor url
  -- '$//' applies 'findNodes' to every descendant of the root cursor.
  let matches = root $// findNodes
  putStr (show matches)
  -- 'print' appends the trailing newline after the list of texts.
  print (extractData <$> matches)

这里使用了 OverloadedStrings 编译指示,因此无需再写 Data.String.fromString。运行这段代码,你会看到确实取到了数据;接下来需要根据你想要预览的内容,按自己的需求重写 extractData。

这是输出

[Cursor @ NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header"),(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"large-user-info"),(Name {nameLocalName = "style", nameNamespace = Nothing, namePrefix = Nothing},"")], elementNodes = [NodeContent "\r\n        ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header-left")], elementNodes = [NodeContent "\r\n            ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"gravatar")], elementNodes = [NodeContent "\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"http://stackoverflow.com/users/2813589/alexander-supertramp")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "img", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "alt", nameNamespace = Nothing, namePrefix = Nothing},""),(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"logo"),(Name {nameLocalName = "height", nameNamespace = Nothing, namePrefix = Nothing},"128"),(Name {nameLocalName = "src", nameNamespace = Nothing, namePrefix = 
Nothing},"https://www.gravatar.com/avatar/0e1f310400630c00abfe892c212bfe18?s=128&d=identicon&r=PG&f=1"),(Name {nameLocalName = "width", nameNamespace = Nothing, namePrefix = Nothing},"128")], elementNodes = []})]}),NodeContent "\r\n                "]}),NodeContent "      \r\n                \r\n\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"reputation")], elementNodes = [NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                            ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"/users/2813589/alexander-supertramp?tab=reputation")], elementNodes = [NodeContent "1,780"]}),NodeContent "\r\n                    "]}),NodeContent "\r\n                    reputation\r\n                "]}),NodeContent "\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badges")], elementNodes = [NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"1 gold badge")], elementNodes = 
[NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge1")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "1"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"5 silver badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge2")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "5"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"22 bronze badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge3")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name 
{nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "22"]})]})]}),NodeContent "                    \r\n                "]}),NodeContent "\r\n                \r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"change-picture-progress")], elementNodes = []}),NodeContent "\r\n            "]}),NodeContent "\r\n\r\n            ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"data")], elementNodes = [NodeContent "\r\n            ",NodeElement (Element {elementName = Name {nameLocalName = "table", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "bio"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "website"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = 
Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"url"),(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"http://www.gildedhonour.com"),(Name {nameLocalName = "rel", nameNamespace = Nothing, namePrefix = Nothing},"nofollow me")], elementNodes = [NodeContent "gildedhonour.com"]})]}),NodeContent "\r\n                    "]}),NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "location"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"label adr")], elementNodes = [NodeContent "roaming in SE Asia"]}),NodeContent "\r\n                    "]}),NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = 
fromList [], elementNodes = []}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "age"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n                    "]}),NodeContent "\r\n                "]}),NodeContent "\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "visits"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "member for"]}),NodeContent "\r\n\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"cool"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2013-09-25 04:05:54Z")], elementNodes = [NodeContent "9 months"]}),NodeContent "\r\n                    "]}),NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, 
namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "seen"]}),NodeContent "\r\n\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"hot"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "\r\n                            ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"relativetime"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "32 mins ago"]}),NodeContent "\r\n                        "]}),NodeContent "\r\n                    "]}),NodeContent "\r\n                "]}),NodeContent "\r\n                ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-profile-stats")], elementNodes = [NodeContent "\r\n                    ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n                    
    ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "stats"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "profile views"]}),NodeContent "\r\n                        ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "465"]}),NodeContent "\r\n                    "]}),NodeContent "\r\n\r\n                                "]}),NodeContent "\r\n            "]}),NodeContent "\r\n            "]}),NodeContent "\r\n\r\n        "]}),NodeContent "\r\n        \r\n        ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-about-me note")], elementNodes = [NodeContent "\r\n        "]}),NodeContent "\r\n        ",NodeElement (Element {elementName = Name {nameLocalName = "br", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"clear")], elementNodes = []}),NodeContent "\r\n    "]})][""]