我的应用程序是一个将信息存储在数据库中的Web scraper(大部分)。到目前为止,我有两节课:
我的测试程序会查看所有网址,抓取内容,推送到数据库中。顺序非常简单,但我想说N个线程运行这些进程(刮擦和存储)。我的顺序代码是:
Private Sub Button4_Click(sender As Object, e As EventArgs) Handles Button4.Click
'Grab List
Dim tDS As New DataSet
Dim tDB As New clsTermsDB
Dim tSpyder As New clsAGDSpyder
Dim sResult As New TermsRuns
'Grab a list of all URLS
tDS = tDB.GetTermsList(1)
Try
For Each Row As DataRow In tDS.Tables(0).Rows
rtbList.AppendText(Row("url_toBeCollected") & vbCrLf)
sResult = tSpyder.SpiderPage(Row("url_toBeCollected"))
'If nothing is found, do not store
If sResult.html <> "" And sResult.text <> "" Then
tDB.InsertScrape(Now(), sResult.html, sResult.text, Row("url_uid"), 1)
End If
Next
Exit Sub
Catch ex As Exception
MessageBox.Show(ex.Message)
End Try
End Sub
考虑到这一点,并注意到我将变量传递给SpiderPage和InsertScrape方法..我怎么能实现线程?它必须简单,但我觉得我一直在谷歌搜索和尝试几天没有成功:(
*** ADDED:SpiderPage方法:
Public Function SpiderPage(PageURL As String) As TermsRuns
Dim webget As New HtmlWeb
Dim node As HtmlNode
Dim doc As New HtmlDocument
Dim docNOHTML As HtmlDocument
Dim uri As New Uri(PageURL)
Dim wc As HttpWebRequest = DirectCast(WebRequest.Create(uri.AbsoluteUri), HttpWebRequest)
Dim wcStream As Stream
wc.AllowAutoRedirect = True
wc.MaximumAutomaticRedirections = 3
'Set Headers
wc.UserAgent = "Mozilla/5.0 (Macintosh; I; Intel Mac OS X 11_7_9; de-LI; rv:1.9b4) Gecko/2012010317 Firefox/10.0a4"
wc.Headers.Add("REMOTE_ADDR", "66.83.101.5")
wc.Headers.Add("HTTP_REFERER", "66.83.101.5")
'Set HTMLAgility Kit Useragent Spoofing (not needed, I don't think)
webget.UserAgent = "Mozilla/5.0 (Macintosh; I; Intel Mac OS X 11_7_9; de-LI; rv:1.9b4) Gecko/2012010317 Firefox/10.0a4"
'Certification STuff
wc.UseDefaultCredentials = True
wc.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials
ServicePointManager.ServerCertificateValidationCallback = AddressOf AcceptAllCertifications
'Create Cookie Jar
Dim CookieJar As New CookieContainer
wc.CookieContainer = CookieJar
'Keep Alive Settings
wc.KeepAlive = True
wc.Timeout = &H7530
'Read the web page
Dim wr As HttpWebResponse = Nothing
Try
wcStream = wc.GetResponse.GetResponseStream
doc.Load(wcStream)
'Remove HTML from the document
docNOHTML = RemoveUnWantedTags(doc)
'Grab only the content inside the <body> tag
node = docNOHTML.DocumentNode.SelectSingleNode("//body")
'Output
SpiderPage = New TermsRuns
SpiderPage.html = node.InnerHtml
SpiderPage.text = node.InnerText
Return SpiderPage
Catch ex As Exception
'Something goes here when scraping returns an error
SpiderPage = New TermsRuns
SpiderPage.html = ""
SpiderPage.text = ""
End Try
End Function
***添加了InsertScrape:
Public Function InsertScrape(scrape_ts As DateTime, scrape_html As String, scrape_text As String, url_id As Integer, tas_id As Integer) As Boolean
Dim myCommand As MySqlClient.MySqlCommand
Dim dt As New DataTable
'Create ds/dt for fill
Dim ds As New DataSet
Dim dtbl As New DataTable
Try
'Set Connection String
myConn.ConnectionString = myConnectionString
'Push Command to Client Object
myCommand = New MySqlClient.MySqlCommand
myCommand.Connection = myConn
myCommand.CommandText = "spInsertScrape"
myCommand.CommandType = CommandType.StoredProcedure
myCommand.Parameters.AddWithValue("@scrape_ts", scrape_ts)
myCommand.Parameters("@scrape_ts").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("@scrape_html", scrape_html)
myCommand.Parameters("@scrape_html").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("@scrape_text", scrape_text)
myCommand.Parameters("@scrape_text").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("@url_id", url_id)
myCommand.Parameters("@url_id").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("@tas_id", tas_id)
myCommand.Parameters("@tas_id").Direction = ParameterDirection.Input
'Open Connection
myConn.Open()
myCommand.ExecuteNonQuery()
'Close Connection
myConn.Close()
InsertScrape = True
Catch ex As Exception
'Put Message Here
InsertScrape = False
MessageBox.Show(ex.Message)
End Try
End Function
提前感谢。