Webcrawler - 获取链接

时间:2011-08-04 21:00:54

标签: f#

我正在尝试抓取网页,获取所有链接,并将其添加到最终将从该函数返回的list<string>

我的代码:

// Parses the HTML string [s] with HtmlAgilityPack and yields the raw string
// value of the href attribute of every <a> element in the document.
// NOTE(review): SelectNodes returns null (not an empty collection) when the
// XPath matches nothing, so the Seq.map would throw on a page with no links
// — confirm inputs always contain at least one anchor.
let getUrls s : seq<string> =
    let doc = new HtmlDocument() in 
              doc.LoadHtml s

    doc.DocumentNode.SelectNodes "//a[@href]"
    |> Seq.map(fun z -> (string z.Attributes.["href"]))

// Intended entry point: download the page at [uri] and recursively crawl
// every link found.  This is the code the question is about and it does NOT
// compile as written: `crawl`'s result is whatever `Seq.map crawl` returns,
// so unifying crawl's return type 'a with seq<'a> would make the type
// infinite — exactly the compiler error quoted below the snippet.
let crawler uri : seq<string> =
    let rec crawl url =
      let web = new WebClient() 
      let data = web.DownloadString url
      getUrls data |> Seq.map crawl (* <-- ERROR HERE *)

    crawl uri

问题出在 crawl 函数的最后一行(getUrls data |> Seq.map crawl),编译器在这里报出如下错误:

  

类型不匹配。期待一个 string -> 'a,但是给的是一个 string -> seq<'a>。在统一 ''a' 和 'seq<'a>' 时,结果类型将是无限的。

3 个答案:

答案 0 :(得分:2)

crawl 返回的是 unit,但这里期望它返回 seq<string>。我想你想要的是类似这样的东西:

// Accepted fix: wrap the body in a lazy sequence expression so `crawl` has
// type string -> seq<string>; each link is yielded, then the links reached
// through it are spliced in with `yield!` (recursive, depth-first).
// NOTE(review): there is no visited-set, so any cycle between pages makes
// this an infinite sequence — consumers should de-duplicate or truncate.
let crawler uri =
  let rec crawl url =
    seq {
      let web = new WebClient() 
      let data = web.DownloadString url
      for url in getUrls data do // inner `url` deliberately shadows the parameter
        yield url
        yield! crawl url
    }
  crawl uri

给 crawl 添加类型注释,应该就能让编译器指出问题所在。

答案 1 :(得分:0)

我认为是这样的:

// NOTE(review): this answer looks broken as posted and is kept verbatim:
// `url` is never used, `data` is an empty Seq so nothing is ever downloaded,
// and `getUrls` expects an HTML *string* — this likely does not typecheck.
// The annotation `(uri : seq<string>)` also conflicts with passing `uri`
// straight to `crawl`, whose argument is treated as a single URL.
let crawler (uri : seq<string>) =
    let rec crawl url =
        let data = Seq.empty
        getUrls data 
        |> Seq.toList
        |> function
            | h :: t -> 
                crawl h
                t |> List.iter crawl
            | _-> ()

    crawl uri

答案 2 :(得分:0)

为了获取链接:

    open System.Net
    open System.IO
    open System.Text.RegularExpressions

    // Thin wrapper around a URL string exposing each step of the download
    // pipeline as a property.  NOTE(review): each member is a get-only
    // property whose body re-runs on every access — e.g. `this.response`
    // issues a brand-new WebRequest each time, and neither the response,
    // stream nor reader is ever disposed.  Fine for a one-shot demo; leaky
    // in production code.
    type Url(x:string)=
     member this.tostring = sprintf "%A" x
     member this.request  = System.Net.WebRequest.Create(x)
     member this.response = this.request.GetResponse()
     member this.stream   = this.response.GetResponseStream()
     member this.reader   = new System.IO.StreamReader(this.stream)
     member this.html     = this.reader.ReadToEnd()

    let linkex                = "href=\s*\"[^\"h]*(http://[^&\"]*)\""

    /// Pulls every http:// target captured by the [linkex] pattern out of
    /// the raw HTML text [txt], returned as a list in match order.
    let getLinks (txt:string) = Regex.Matches(txt, linkex)
                                |> Seq.cast<Match>
                                |> Seq.map (fun m -> m.Groups.[1].Value)
                                |> List.ofSeq

    /// Downloads the page addressed by [url] and extracts its http:// links.
    let collectLinks (url:Url) = getLinks url.html