我发现这个article解释了如何使用Excel VBA从网站上抓取某些标签。
以下代码从其找到的第一个<p>
标记中获取内容:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.title
On Error GoTo err_clear
Cells(i, 3) = doc.GetElementsByTagName("p")(0).innerText
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
我想让抓取工具获取网页上<p>
标记内的所有内容。所以我想缺少某种foreach
功能。
如何收集多个<p>
标记的内容?
更新 工作代码!
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
counter = counter + 1
Cells(i, counter + 2).Value = Cells(counter + 1).Value & el.innerText
Next el
counter = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 10)).Columns.AutoFit
Next i
End Sub
答案 0 :(得分:6)
你快到了! doc.GetElementsByTagName("p")
返回HTMLParagraphElement
个对象的集合,您使用doc.GetElementsByTagName("p")(0)
访问了第一个条目。正如您所提到的,For Each
循环可让您依次访问每个循环:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
Cells(i, 3).Value = Cells(i, 3).Value & ", " & el.innerText
Next el
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
答案 1 :(得分:0)
如果您只需要获取纯文本形式的网页内容,则此代码更加简洁
Function WEBSITE_TEXT(Destination As String) As String
' Requires a reference to Microsoft XML, v6.0
' Draws on the stackoverflow answer at bit.ly/parseXML
Dim myRequest As XMLHTTP60
Dim myDomDoc As DOMDocument60
' Check and clean inputs
On Error GoTo exitRoute
If Destination = "" Then
WEBSITE_TEXT = ""
Exit Function
End If
' Read the XML data from the Google Maps API
Set myRequest = New XMLHTTP60
myRequest.Open "GET", Destination, False
myRequest.send
' Parse HTML content
Dim html As New HTMLDocument
Dim text As String
html.body.innerHTML = myRequest.responseText
' Return the website content
text = html.body.innerText
If Not html Is Nothing Then WEBSITE_TEXT = text
exitRoute:
' Tidy up
text = ""
Set myRequest = Nothing
End Function