异步Webcrawling F#,出了什么问题?

时间:2010-06-11 13:33:48

标签: f# web-crawler

不太确定是否可以这样做,但我的问题是:我的代码有问题吗?它没有我想要的那么快,因为我使用了大量的异步工作流程,也许我做错了。这里的目标是构建一些可以在不到一个小时内爬行20 000页的东西。

    open System
    open System.Text
    open System.Net
    open System.IO
    open System.Text.RegularExpressions
    open System.Collections.Generic
    open System.ComponentModel
    open Microsoft.FSharp
    open System.Threading
    //This is the Parallel.Fs file

    type ComparableUri ( uri: string ) = 

        inherit System.Uri( uri )

        let elts (uri:System.Uri) = 
            uri.Scheme, uri.Host, uri.Port, uri.Segments

        interface System.IComparable with 
            member this.CompareTo( uri2 ) = 
                compare (elts this) (elts(uri2 :?> ComparableUri))

        override this.Equals(uri2) = 
            compare this (uri2 :?> ComparableUri ) = 0

        override this.GetHashCode() = 0 


    ///////////////////////////////////////////////Functions to retrieve html string//////////////////////////////
    let mutable error = Set.empty<ComparableUri>
    let mutable visited = Set.empty<ComparableUri>

    let getHtmlPrimitiveAsyncDelay (delay:int) (uri : ComparableUri)  =
        async{
                try
                    let req =  (WebRequest.Create(uri)) :?> HttpWebRequest

                    // 'use' is equivalent to ‘using’ in C# for an IDisposable
                    req.UserAgent<-"Mozilla"

                    //Console.WriteLine("Waiting")
                    do! Async.Sleep(delay * 250)
                    let! resp =    (req.AsyncGetResponse())
                    Console.WriteLine(uri.AbsoluteUri+" got response after delay "+string delay)
                    use stream = resp.GetResponseStream()
                    use reader = new StreamReader(stream)
                    let html = reader.ReadToEnd()
                    return html
                with 
                | _ as ex -> Console.WriteLine( ex.ToString() ) 
                             lock error (fun () -> error<- error.Add uri )
                             lock visited (fun () -> visited<-visited.Add uri )
                             return "BadUri"
                     }



    ///////////////////////////////////////////////Active Pattern Matching to retreive href//////////////////////////////

    let (|Matches|_|) (pat:string) (inp:string) =
        let m = Regex.Matches(inp, pat)
        // Note the List.tl, since the first group is always the entirety of the matched string.
        if m.Count > 0
        then Some (List.tail [ for g in m -> g.Value ])
        else None

    let (|Match|_|) (pat:string) (inp:string) =
        let m = Regex.Match(inp, pat) 
        // Note the List.tl, since the first group is always the entirety of the matched string.
        if m.Success then 
            Some (List.tail [ for g in m.Groups -> g.Value ])
        else 
            None
    ///////////////////////////////////////////////Find Bad  href//////////////////////////////

    let isEmail (link:string) = 
        link.Contains("@")

    let isMailto (link:string) = 
        if Seq.length link >=6 then 
            link.[0..5] = "mailto"
        else
            false

    let isJavascript (link:string) = 
         if Seq.length link >=10 then 
            link.[0..9] = "javascript"
         else
            false

    let isBadUri (link:string) = 
          link="BadUri"

    let isEmptyHttp (link:string) = 
        link="http://"

    let isFile (link:string)=
         if Seq.length link >=6 then 
            link.[0..5] = "file:/"
         else
            false

    let containsPipe (link:string) = 
        link.Contains("|")


    let isAdLink (link:string) = 
          if Seq.length link >=6 then 
            link.[0..5] = "adlink"
          elif Seq.length link >=9 then 
            link.[0..8] = "http://adLink"
          else
            false

///////////////////////////////////////////////Find Bad  href//////////////////////////////

    let getHref (htmlString:string) = 

        let urlPat = "href=\"([^\"]+)"

        match htmlString with 
        | Matches urlPat urls -> urls |> List.map( fun href -> match href with 
                                                               | Match (urlPat) (link::[]) -> link
                                                               | _ -> failwith "The href was not in correct format, there was more than one match" )

        | _ -> Console.WriteLine( "No links for this page" );[] 
        |> List.filter( fun link -> not(isEmail link) )
        |> List.filter( fun link -> not(isMailto link) )
        |> List.filter( fun link -> not(isJavascript link) )
        |> List.filter( fun link -> not(isBadUri link) )
        |> List.filter( fun link -> not(isEmptyHttp link) )
        |> List.filter( fun link -> not(isFile link) )
        |> List.filter( fun link -> not(containsPipe link) )
        |> List.filter( fun link -> not(isAdLink link) )

    let treatAjax (href:System.Uri)  = 
        let link = href.ToString()
        let firstPart = (link.Split([|"#"|],System.StringSplitOptions.None)).[0]
        new Uri(firstPart)

    //only follow pages with certain extnsion or ones with no exensions
    let followHref (href:System.Uri) = 

        let valid2 = set[".py"]
        let valid3 = set[".php";".htm";".asp"]
        let valid4 = set[".php3";".php4";".php5";".html";".aspx"]



        let arrLength = href.Segments |> Array.length
        let lastExtension = (href.Segments).[arrLength-1] 
        let lengthLastExtension = Seq.length lastExtension

        if (lengthLastExtension <= 3)  then 
            not( lastExtension.Contains(".") )
        else
            //test for the 2 case
            let last4 = lastExtension.[(lengthLastExtension-1)-3..(lengthLastExtension-1)]

            let isValid2 = valid2|>Seq.exists(fun validEnd -> last4.EndsWith( validEnd) )

            if isValid2 then 
                true
            else
                if lengthLastExtension <= 4 then 
                    not( last4.Contains(".") )
                else
                    let last5 = lastExtension.[(lengthLastExtension-1)-4..(lengthLastExtension-1)]
                    let isValid3 = valid3|>Seq.exists(fun validEnd -> last5.EndsWith( validEnd) )

                    if isValid3 then 
                        true
                    else
                        if lengthLastExtension <= 5 then 
                            not( last5.Contains(".") )
                        else
                            let last6 = lastExtension.[(lengthLastExtension-1)-5..(lengthLastExtension-1)]
                            let isValid4 = valid4|>Seq.exists(fun validEnd -> last6.EndsWith( validEnd) )

                            if isValid4 then 
                                true
                            else
                                not( last6.Contains(".") ) && not(lastExtension.[0..5] = "mailto")




//Create the correct links / -> add the homepage , make then a comparabel Uri
let hrefLinksToUri ( uri:ComparableUri ) (hrefLinks:string list)  = 
    hrefLinks
    |> List.map( fun link -> try 
                                 if Seq.length link <4 then 
                                    Some(new Uri( uri, link ))
                                 else 
                                    if link.[0..3] = "http" then  
                                        Some(new Uri(link))
                                    else
                                        Some(new Uri( uri, link ))

                             with
                             | _ as ex -> Console.WriteLine(link);
                                          lock error (fun () ->error<-error.Add uri)
                                          None
                            )
    |> List.filter( fun link -> link.IsSome )
    |> List.map( fun o -> o.Value)
    |> List.map( fun uri -> new ComparableUri( string uri ) )

//Treat uri , removing ajax last part , and only following links specified b Benoit
let linksToFollow (hrefUris:ComparableUri list) = 
    hrefUris
    |>List.map( treatAjax )
    |>List.filter( fun link -> followHref link )
    |>List.map( fun uri -> new ComparableUri( string uri ) )
    |>Set.ofList



let needToVisit uri = 
      ( lock visited (fun () -> not( visited.Contains uri) ) ) && (lock error (fun () -> not( error.Contains uri) ))



let getLinksToFollowAsyncDelay (delay:int) ( uri: ComparableUri )  = 
    //write 
    async{    
              let! links = getHtmlPrimitiveAsyncDelay delay uri 

              lock visited (fun () ->visited<-visited.Add uri)

              let linksToFollow = getHref links
                                  |> hrefLinksToUri uri
                                  |> linksToFollow
                                  |> Set.filter( needToVisit )
              return linksToFollow
              }

let getDelay(uri:ComparableUri) (authorityDelay:Dictionary<string,System.Diagnostics.Stopwatch >) = 

    let uriAuthority = uri.Authority
    let hasAuthority,watch = authorityDelay.TryGetValue(uriAuthority)

    if hasAuthority then 
        let elapsed = watch.Elapsed
        let s = TimeSpan(0,0,0,0,500)-elapsed
        if s.TotalMilliseconds < 0.0 then 
            0
        else
            int(s.TotalMilliseconds)

    else 
        let temp = System.Diagnostics.Stopwatch()
        temp.Start()
        authorityDelay.Add(uriAuthority,temp)
        0




let rec getLinksToFollowFromSetAsync maxIteration  ( uris: seq<ComparableUri> )  = 

    let authorityDelay = Dictionary<string,System.Diagnostics.Stopwatch>()

    if maxIteration = 100 then 
        Console.WriteLine("Finished")
    else
        //Unite by authority add delay for those we same authority others ignore 
        let stopwatch= System.Diagnostics.Stopwatch()
        stopwatch.Start()
        let newLinks  = uris
                        |> Seq.map(  fun uri -> let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay )
                                                getLinksToFollowAsyncDelay delay uri )
                        |> Async.Parallel
                        |> Async.RunSynchronously
                        |> Seq.concat
        stopwatch.Stop()
        Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : "+string stopwatch.Elapsed+"\n\n\n\n\n\n\n\n\n")

        getLinksToFollowFromSetAsync (maxIteration+1) newLinks

seq[set[ComparableUri( "http://rue89.com/" )]]
|>PSeq.ofSeq
|>PSeq.iter(getLinksToFollowFromSetAsync 0 )

    getLinksToFollowFromSetAsync 0 (seq[ComparableUri( "http://twitter.com/" )])

    Console.WriteLine("Finished")

有些反馈会很棒!谢谢(注意这只是我为了好玩而做的事情)

1 个答案:

答案 0 :(得分:3)

我认为罪魁祸首是do! Async.Sleep(delay * 250)行 - 你逐渐等待的时间越来越长。它是什么原因?