不太确定是否可以这样做,但我的问题是:我的代码有问题吗?它没有我想要的那么快,因为我使用了大量的异步工作流程,也许我做错了。这里的目标是构建一些可以在不到一个小时内爬行20 000页的东西。
open System
open System.Text
open System.Net
open System.IO
open System.Text.RegularExpressions
open System.Collections.Generic
open System.ComponentModel
open Microsoft.FSharp
open System.Threading
//This is the Parallel.Fs file
type ComparableUri ( uri: string ) =
inherit System.Uri( uri )
let elts (uri:System.Uri) =
uri.Scheme, uri.Host, uri.Port, uri.Segments
interface System.IComparable with
member this.CompareTo( uri2 ) =
compare (elts this) (elts(uri2 :?> ComparableUri))
override this.Equals(uri2) =
compare this (uri2 :?> ComparableUri ) = 0
override this.GetHashCode() = 0
///////////////////////////////////////////////Functions to retrieve html string//////////////////////////////
let mutable error = Set.empty<ComparableUri>
let mutable visited = Set.empty<ComparableUri>
let getHtmlPrimitiveAsyncDelay (delay:int) (uri : ComparableUri) =
async{
try
let req = (WebRequest.Create(uri)) :?> HttpWebRequest
// 'use' is equivalent to ‘using’ in C# for an IDisposable
req.UserAgent<-"Mozilla"
//Console.WriteLine("Waiting")
do! Async.Sleep(delay * 250)
let! resp = (req.AsyncGetResponse())
Console.WriteLine(uri.AbsoluteUri+" got response after delay "+string delay)
use stream = resp.GetResponseStream()
use reader = new StreamReader(stream)
let html = reader.ReadToEnd()
return html
with
| _ as ex -> Console.WriteLine( ex.ToString() )
lock error (fun () -> error<- error.Add uri )
lock visited (fun () -> visited<-visited.Add uri )
return "BadUri"
}
///////////////////////////////////////////////Active Pattern Matching to retreive href//////////////////////////////
let (|Matches|_|) (pat:string) (inp:string) =
let m = Regex.Matches(inp, pat)
// Note the List.tl, since the first group is always the entirety of the matched string.
if m.Count > 0
then Some (List.tail [ for g in m -> g.Value ])
else None
let (|Match|_|) (pat:string) (inp:string) =
let m = Regex.Match(inp, pat)
// Note the List.tl, since the first group is always the entirety of the matched string.
if m.Success then
Some (List.tail [ for g in m.Groups -> g.Value ])
else
None
///////////////////////////////////////////////Find Bad href//////////////////////////////
let isEmail (link:string) =
link.Contains("@")
let isMailto (link:string) =
if Seq.length link >=6 then
link.[0..5] = "mailto"
else
false
let isJavascript (link:string) =
if Seq.length link >=10 then
link.[0..9] = "javascript"
else
false
let isBadUri (link:string) =
link="BadUri"
let isEmptyHttp (link:string) =
link="http://"
let isFile (link:string)=
if Seq.length link >=6 then
link.[0..5] = "file:/"
else
false
let containsPipe (link:string) =
link.Contains("|")
let isAdLink (link:string) =
if Seq.length link >=6 then
link.[0..5] = "adlink"
elif Seq.length link >=9 then
link.[0..8] = "http://adLink"
else
false
///////////////////////////////////////////////Find Bad href//////////////////////////////
let getHref (htmlString:string) =
let urlPat = "href=\"([^\"]+)"
match htmlString with
| Matches urlPat urls -> urls |> List.map( fun href -> match href with
| Match (urlPat) (link::[]) -> link
| _ -> failwith "The href was not in correct format, there was more than one match" )
| _ -> Console.WriteLine( "No links for this page" );[]
|> List.filter( fun link -> not(isEmail link) )
|> List.filter( fun link -> not(isMailto link) )
|> List.filter( fun link -> not(isJavascript link) )
|> List.filter( fun link -> not(isBadUri link) )
|> List.filter( fun link -> not(isEmptyHttp link) )
|> List.filter( fun link -> not(isFile link) )
|> List.filter( fun link -> not(containsPipe link) )
|> List.filter( fun link -> not(isAdLink link) )
let treatAjax (href:System.Uri) =
let link = href.ToString()
let firstPart = (link.Split([|"#"|],System.StringSplitOptions.None)).[0]
new Uri(firstPart)
//only follow pages with certain extnsion or ones with no exensions
let followHref (href:System.Uri) =
let valid2 = set[".py"]
let valid3 = set[".php";".htm";".asp"]
let valid4 = set[".php3";".php4";".php5";".html";".aspx"]
let arrLength = href.Segments |> Array.length
let lastExtension = (href.Segments).[arrLength-1]
let lengthLastExtension = Seq.length lastExtension
if (lengthLastExtension <= 3) then
not( lastExtension.Contains(".") )
else
//test for the 2 case
let last4 = lastExtension.[(lengthLastExtension-1)-3..(lengthLastExtension-1)]
let isValid2 = valid2|>Seq.exists(fun validEnd -> last4.EndsWith( validEnd) )
if isValid2 then
true
else
if lengthLastExtension <= 4 then
not( last4.Contains(".") )
else
let last5 = lastExtension.[(lengthLastExtension-1)-4..(lengthLastExtension-1)]
let isValid3 = valid3|>Seq.exists(fun validEnd -> last5.EndsWith( validEnd) )
if isValid3 then
true
else
if lengthLastExtension <= 5 then
not( last5.Contains(".") )
else
let last6 = lastExtension.[(lengthLastExtension-1)-5..(lengthLastExtension-1)]
let isValid4 = valid4|>Seq.exists(fun validEnd -> last6.EndsWith( validEnd) )
if isValid4 then
true
else
not( last6.Contains(".") ) && not(lastExtension.[0..5] = "mailto")
//Create the correct links / -> add the homepage , make then a comparabel Uri
let hrefLinksToUri ( uri:ComparableUri ) (hrefLinks:string list) =
hrefLinks
|> List.map( fun link -> try
if Seq.length link <4 then
Some(new Uri( uri, link ))
else
if link.[0..3] = "http" then
Some(new Uri(link))
else
Some(new Uri( uri, link ))
with
| _ as ex -> Console.WriteLine(link);
lock error (fun () ->error<-error.Add uri)
None
)
|> List.filter( fun link -> link.IsSome )
|> List.map( fun o -> o.Value)
|> List.map( fun uri -> new ComparableUri( string uri ) )
//Treat uri , removing ajax last part , and only following links specified b Benoit
let linksToFollow (hrefUris:ComparableUri list) =
hrefUris
|>List.map( treatAjax )
|>List.filter( fun link -> followHref link )
|>List.map( fun uri -> new ComparableUri( string uri ) )
|>Set.ofList
let needToVisit uri =
( lock visited (fun () -> not( visited.Contains uri) ) ) && (lock error (fun () -> not( error.Contains uri) ))
let getLinksToFollowAsyncDelay (delay:int) ( uri: ComparableUri ) =
//write
async{
let! links = getHtmlPrimitiveAsyncDelay delay uri
lock visited (fun () ->visited<-visited.Add uri)
let linksToFollow = getHref links
|> hrefLinksToUri uri
|> linksToFollow
|> Set.filter( needToVisit )
return linksToFollow
}
let getDelay(uri:ComparableUri) (authorityDelay:Dictionary<string,System.Diagnostics.Stopwatch >) =
let uriAuthority = uri.Authority
let hasAuthority,watch = authorityDelay.TryGetValue(uriAuthority)
if hasAuthority then
let elapsed = watch.Elapsed
let s = TimeSpan(0,0,0,0,500)-elapsed
if s.TotalMilliseconds < 0.0 then
0
else
int(s.TotalMilliseconds)
else
let temp = System.Diagnostics.Stopwatch()
temp.Start()
authorityDelay.Add(uriAuthority,temp)
0
let rec getLinksToFollowFromSetAsync maxIteration ( uris: seq<ComparableUri> ) =
let authorityDelay = Dictionary<string,System.Diagnostics.Stopwatch>()
if maxIteration = 100 then
Console.WriteLine("Finished")
else
//Unite by authority add delay for those we same authority others ignore
let stopwatch= System.Diagnostics.Stopwatch()
stopwatch.Start()
let newLinks = uris
|> Seq.map( fun uri -> let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay )
getLinksToFollowAsyncDelay delay uri )
|> Async.Parallel
|> Async.RunSynchronously
|> Seq.concat
stopwatch.Stop()
Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : "+string stopwatch.Elapsed+"\n\n\n\n\n\n\n\n\n")
getLinksToFollowFromSetAsync (maxIteration+1) newLinks
seq[set[ComparableUri( "http://rue89.com/" )]]
|>PSeq.ofSeq
|>PSeq.iter(getLinksToFollowFromSetAsync 0 )
getLinksToFollowFromSetAsync 0 (seq[ComparableUri( "http://twitter.com/" )])
Console.WriteLine("Finished")
有些反馈会很棒!谢谢(注意这只是我为了好玩而做的事情)
答案 0 :(得分:3)
我认为罪魁祸首是do! Async.Sleep(delay * 250)
行 - 你逐渐等待的时间越来越长。它是什么原因?