我正在尝试创建一个简单的应用程序,主要用于比较几个网站上的内容。我已经看到了一些把页面上所有文本提取到应用程序中的方法。但有没有办法只提取其中的一部分,比如标题和描述?
以书籍网站为例:有没有办法搜索书名,然后只显示各个网站上不同的评论、简介和价格,而不包含任何无用的文本?
答案 0 :(得分:1)
一个快速而简单的解决方案是使用 WebBrowser 控件,它通过 .Document 属性公开 HtmlDocument。
''' <summary>
''' Form that loads a web page in a WebBrowser control and, once the page has
''' finished loading, reads the page title and selected &lt;meta&gt; values
''' (description, keywords, author) from the document's &lt;head&gt; element.
''' </summary>
Public Class Form1

    ''' <summary>
    ''' Starts navigation to the target page. Script error dialogs are
    ''' suppressed so that navigation is not interrupted by pop-ups.
    ''' </summary>
    Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
        Me.WebBrowser1.ScriptErrorsSuppressed = True
        Me.WebBrowser1.Navigate(New Uri("http://stackoverflow.com/"))
    End Sub

    ''' <summary>
    ''' Reads the title and meta values from the loaded document.
    ''' </summary>
    Private Sub WebBrowser1_DocumentCompleted(sender As Object, e As WebBrowserDocumentCompletedEventArgs) Handles WebBrowser1.DocumentCompleted
        ' DocumentCompleted can be raised more than once per navigation (for
        ' example once per frame); only proceed when the whole page is ready.
        If Me.WebBrowser1.ReadyState <> WebBrowserReadyState.Complete Then
            Return
        End If

        Dim document As HtmlDocument = Me.WebBrowser1.Document
        If document Is Nothing Then
            Return
        End If

        ' NOTE: these values are only computed here; use them (display, store,
        ' compare, ...) as the application requires.
        Dim title As String = Me.GetTitle(document)
        Dim description As String = Me.GetMeta(document, "description")
        Dim keywords As String = Me.GetMeta(document, "keywords")
        Dim author As String = Me.GetMeta(document, "author")
    End Sub

    ''' <summary>
    ''' Returns the text of the first &lt;title&gt; element inside &lt;head&gt;.
    ''' </summary>
    ''' <param name="document">The document to inspect; may be Nothing.</param>
    ''' <returns>The title text, or String.Empty when there is none.</returns>
    Private Function GetTitle(document As HtmlDocument) As String
        Dim head As HtmlElement = Me.GetHead(document)
        If head IsNot Nothing Then
            For Each el As HtmlElement In head.GetElementsByTagName("title")
                ' InnerText is Nothing for an empty element; normalise it so
                ' callers always receive a non-null string.
                Dim text As String = el.InnerText
                If text Is Nothing Then
                    Return String.Empty
                End If
                Return text
            Next
        End If
        Return String.Empty
    End Function

    ''' <summary>
    ''' Returns the "content" attribute of the first &lt;meta&gt; element in
    ''' &lt;head&gt; whose "name" attribute equals <paramref name="name"/>.
    ''' </summary>
    ''' <param name="document">The document to inspect; may be Nothing.</param>
    ''' <param name="name">The meta name to look for (case-insensitive).</param>
    ''' <returns>The content value, or String.Empty when no such meta exists.</returns>
    Private Function GetMeta(document As HtmlDocument, name As String) As String
        Dim head As HtmlElement = Me.GetHead(document)
        If head IsNot Nothing Then
            For Each el As HtmlElement In head.GetElementsByTagName("meta")
                ' Ordinal comparison: meta names are markup tokens, so the match
                ' must not depend on the current UI culture.
                If String.Equals(el.GetAttribute("name"), name, StringComparison.OrdinalIgnoreCase) Then
                    Return el.GetAttribute("content")
                End If
            Next
        End If
        Return String.Empty
    End Function

    ''' <summary>
    ''' Returns the first &lt;head&gt; element of the document.
    ''' </summary>
    ''' <param name="document">The document to inspect; may be Nothing.</param>
    ''' <returns>The head element, or Nothing when the document is Nothing or has no head.</returns>
    Private Function GetHead(document As HtmlDocument) As HtmlElement
        If document Is Nothing Then
            Return Nothing
        End If
        For Each el As HtmlElement In document.GetElementsByTagName("head")
            Return el
        Next
        Return Nothing
    End Function

End Class