Question

我的应用程序（大体上）是一个将信息存储到数据库中的 Web 抓取工具（scraper）。到目前为止，我有两个类：

clsSpyder - 基本上封装了整个抓取流程
clsDB - 负责处理所有数据库操作

我的测试程序会遍历所有网址，抓取内容，并将其写入数据库。按顺序执行非常简单，但我想用 N 个线程来运行这些流程（抓取和存储）。我的顺序执行代码是：

''' <summary>
''' Loads the URL list through clsTermsDB.GetTermsList(1), scrapes each URL with
''' clsAGDSpyder.SpiderPage and stores every non-empty result via InsertScrape.
''' </summary>
''' <remarks>
''' The whole loop runs sequentially on the thread that raises the Click event.
''' Any exception raised while processing a row stops the loop and is shown in a message box.
''' </remarks>
Private Sub Button4_Click(sender As Object, e As EventArgs) Handles Button4.Click

    Dim tDB As New clsTermsDB
    Dim tSpyder As New clsAGDSpyder

    'Grab a list of all URLs (assigned directly; no throwaway DataSet is created first)
    Dim tDS As DataSet = tDB.GetTermsList(1)

    Try

        For Each Row As DataRow In tDS.Tables(0).Rows

            'Read the URL column once and reuse it for both the log and the scrape
            Dim url As Object = Row("url_toBeCollected")

            rtbList.AppendText(url & vbCrLf)
            Dim sResult As TermsRuns = tSpyder.SpiderPage(url)

            'If nothing is found, do not store.
            'AndAlso short-circuits: the text is only checked when the html is non-empty.
            If sResult.html <> "" AndAlso sResult.text <> "" Then
                tDB.InsertScrape(Now(), sResult.html, sResult.text, Row("url_uid"), 1)
            End If

        Next

    Catch ex As Exception
        MessageBox.Show(ex.Message)
    End Try
End Sub

考虑到这一点，并注意到我需要向 SpiderPage 和 InsertScrape 方法传递变量……我该如何实现多线程？这应该很简单，但我已经搜索并尝试了好几天，仍然没有成功 :(

*** ADDED：SpiderPage方法：

    ''' <summary>
    ''' Downloads the page at <paramref name="PageURL"/>, passes the parsed document through
    ''' RemoveUnWantedTags and returns the inner HTML and inner text of its body element.
    ''' </summary>
    ''' <param name="PageURL">URL of the page to fetch; it is parsed with <c>New Uri(PageURL)</c>.</param>
    ''' <returns>
    ''' A TermsRuns whose html and text hold the body content. Both are empty strings when the
    ''' download or parsing fails, or when the document has no body element.
    ''' </returns>
    Public Function SpiderPage(PageURL As String) As TermsRuns
        Const BrowserUserAgent As String = "Mozilla/5.0 (Macintosh; I; Intel Mac OS X 11_7_9; de-LI; rv:1.9b4) Gecko/2012010317 Firefox/10.0a4"
        Const RequestTimeoutMs As Integer = 30000 'Same value as the former &H7530 literal

        Dim pageUri As New Uri(PageURL)
        Dim request As HttpWebRequest = DirectCast(WebRequest.Create(pageUri.AbsoluteUri), HttpWebRequest)

        request.AllowAutoRedirect = True
        request.MaximumAutomaticRedirections = 3

        'Set Headers
        'NOTE(review): REMOTE_ADDR and HTTP_REFERER look like CGI variable names rather than
        'standard HTTP request headers (the standard referrer is HttpWebRequest.Referer).
        'They are sent unchanged here to preserve the existing request - confirm the intent.
        request.UserAgent = BrowserUserAgent
        request.Headers.Add("REMOTE_ADDR", "66.83.101.5")
        request.Headers.Add("HTTP_REFERER", "66.83.101.5")

        'Credentials / certificates
        request.UseDefaultCredentials = True
        request.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials
        'ServicePointManager is static, so this callback applies to every request, not just this one.
        'NOTE(review): if AcceptAllCertifications accepts every certificate, TLS validation is
        'effectively disabled - confirm this is intended.
        ServicePointManager.ServerCertificateValidationCallback = AddressOf AcceptAllCertifications

        'Create Cookie Jar
        request.CookieContainer = New CookieContainer

        'Keep Alive Settings
        request.KeepAlive = True
        request.Timeout = RequestTimeoutMs

        Try
            'Dispose the response and its stream so the underlying connection is released
            'even when loading or parsing throws.
            Using response As WebResponse = request.GetResponse()
                Using responseStream As Stream = response.GetResponseStream()

                    Dim doc As New HtmlDocument
                    doc.Load(responseStream)

                    'Remove unwanted tags from the document
                    Dim cleaned As HtmlDocument = RemoveUnWantedTags(doc)

                    'Grab only the content inside the <body> tag
                    Dim body As HtmlNode = cleaned.DocumentNode.SelectSingleNode("//body")

                    If body IsNot Nothing Then
                        Dim page As New TermsRuns
                        page.html = body.InnerHtml
                        page.text = body.InnerText
                        Return page
                    End If

                End Using
            End Using

        Catch ex As Exception
            'Best effort: a failed scrape is reported to the caller as an empty result (below).
        End Try

        'Failure, or no <body> element: return an empty result
        Dim emptyResult As New TermsRuns
        emptyResult.html = ""
        emptyResult.text = ""
        Return emptyResult

    End Function

***添加了InsertScrape：

''' <summary>
''' Stores one scrape result by executing the stored procedure "spInsertScrape" on the
''' class-level connection myConn.
''' </summary>
''' <param name="scrape_ts">Value passed as @scrape_ts.</param>
''' <param name="scrape_html">Value passed as @scrape_html.</param>
''' <param name="scrape_text">Value passed as @scrape_text.</param>
''' <param name="url_id">Value passed as @url_id.</param>
''' <param name="tas_id">Value passed as @tas_id.</param>
''' <returns>
''' True when the procedure executed without an exception; False otherwise, after the
''' exception message has been shown in a message box.
''' </returns>
Public Function InsertScrape(scrape_ts As DateTime, scrape_html As String, scrape_text As String, url_id As Integer, tas_id As Integer) As Boolean
    Try

        'Set Connection String
        myConn.ConnectionString = myConnectionString

        'Build the command; Using disposes it even when execution fails
        Using myCommand As New MySqlClient.MySqlCommand
            myCommand.Connection = myConn
            myCommand.CommandText = "spInsertScrape"
            myCommand.CommandType = CommandType.StoredProcedure

            With myCommand.Parameters
                .AddWithValue("@scrape_ts", scrape_ts).Direction = ParameterDirection.Input
                .AddWithValue("@scrape_html", scrape_html).Direction = ParameterDirection.Input
                .AddWithValue("@scrape_text", scrape_text).Direction = ParameterDirection.Input
                .AddWithValue("@url_id", url_id).Direction = ParameterDirection.Input
                .AddWithValue("@tas_id", tas_id).Direction = ParameterDirection.Input
            End With

            'Open Connection
            myConn.Open()
            myCommand.ExecuteNonQuery()
        End Using

        Return True

    Catch ex As Exception
        MessageBox.Show(ex.Message)
        Return False

    Finally
        'Always close the shared connection. Previously a failed insert left it open,
        'so the next call failed when it tried to assign ConnectionString.
        If myConn IsNot Nothing Then
            myConn.Close()
        End If
    End Try
End Function

提前感谢。

VB.NET：为这个应用程序实现多线程的正确方法

0 个答案: