我有一段可以正常运行的网页抓取(web scraper)代码,但问题是它抓取链接的速度非常慢,例如每抓取一个链接都要间隔随机的若干分钟。我的程序先抓取第一个 HTML 页面中的所有链接,然后再访问每一个已抓取到的链接,并从对应页面中各抓取一个链接。有没有办法让这个过程更快?我使用的是 BackgroundWorker,所以问题不在 BackgroundWorker,而在代码本身。
这是我目前可以运行的代码:
' Stage 1: download the search results page and collect the absolute URL of
' every anchor whose href contains "/biz". The URLs are stored in sList for
' the follow-up loop; counter is initialised here for that loop as well.
Dim sList As New List(Of String)
Dim counter As Integer = 0
' URLs already queued. Skipping repeats means each page is downloaded only
' once by the follow-up loop instead of once per duplicate link.
Dim INseen As New HashSet(Of String)
Dim INWebSource As String
' WebClient is IDisposable; Using releases it as soon as the download is done.
Using INwebClient As New System.Net.WebClient
    INWebSource = INwebClient.DownloadString("http://www.yelp.com/search?find_desc=Hotels&find_loc=CA&ns=1&ls=88145bf794a78999#")
End Using
Dim INhtmlDoc As New HtmlAgilityPack.HtmlDocument()
INhtmlDoc.LoadHtml(INWebSource)
' SelectNodes returns Nothing (not an empty collection) when no node matches,
' so guard before iterating.
Dim INlinks As HtmlAgilityPack.HtmlNodeCollection = INhtmlDoc.DocumentNode.SelectNodes("//a[@href]")
If INlinks IsNot Nothing Then
    For Each INlink As HtmlNode In INlinks
        Dim INhref As String = INlink.Attributes("href").Value
        If INhref.Contains("/biz") Then
            ' Prefix the site host at the position where "/biz" starts.
            Dim INoutput As String = INhref.Insert(INhref.IndexOf("/biz"), "http://www.yelp.com")
            ' HashSet.Add returns False when the URL was already queued.
            If INseen.Add(INoutput) Then
                sList.Add(INoutput)
            End If
        End If
    Next
End If
' Stage 2: download every page queued in sList and add each "/biz_share" link
' that does not contain "reviewid" to ListBox1 (once per distinct URL), keeping
' Label1 in sync with the number of entries added.
' NOTE(review): this touches ListBox1/Label1 directly; if it runs inside a
' BackgroundWorker DoWork handler, the UI updates presumably belong in
' ProgressChanged/RunWorkerCompleted - confirm against the caller.
' One WebClient is reused for all sequential downloads and disposed at the end,
' instead of creating (and leaking) a new one per page.
Using client As New System.Net.WebClient
    For Each pageUrl As String In sList
        Dim pageSource As String = client.DownloadString(pageUrl)
        Dim pageDoc As New HtmlAgilityPack.HtmlDocument()
        pageDoc.LoadHtml(pageSource)
        ' SelectNodes returns Nothing when the page has no matching anchors.
        Dim anchors As HtmlAgilityPack.HtmlNodeCollection = pageDoc.DocumentNode.SelectNodes("//a[@href]")
        If anchors IsNot Nothing Then
            For Each anchor As HtmlNode In anchors
                Dim href As String = anchor.Attributes("href").Value
                If href.Contains("/biz_share") Then
                    ' Prefix the site host at the position where "/biz" starts.
                    Dim shareUrl As String = href.Insert(href.IndexOf("/biz"), "http://www.yelp.com")
                    If Not shareUrl.Contains("reviewid") AndAlso Not ListBox1.Items.Contains(shareUrl) Then
                        ListBox1.Items.Add(shareUrl)
                        counter += 1
                    End If
                End If
            Next
        End If
        ' Explicit conversion so the assignment also compiles under Option Strict On.
        Label1.Text = counter.ToString()
    Next
End Using