我一直在学习如何使用 Excel 和 VBA 从网页抓取数据。我之前使用的数据源遇到了一些障碍,因此改用 https://www.skyscanner.net/news/airports/heathrow-airport-live-flight-arrivals-and-departures。我面临的问题是:ID 为 "flight-status-board-arrivals" 的表中有一个 <thead> 和一个 <tbody>。我可以很好地提取表头数据,但是当我尝试继续提取表体数据时,循环直接结束了。我尝试改用 .Children(1),并通过输出 Length 进行测试和调试,结果返回 0。这表明该对象中没有任何内容,但我不明白为什么。希望我已经把所有情况都说明清楚了,任何帮助我都将不胜感激。另外,我知道这可以用其他语言来实现,但由于我正在学习 VBA,我认为最好先用 VBA 学会,再去学习新的语言。
' GrabWebData
' Opens the Heathrow live-arrivals page in a hidden Internet Explorer window
' and copies up to the first five cells of every <tr> found in the second
' child of table "flight-status-board-arrivals" into columns A:E of Sheet1,
' starting at row 1.
'
' Requires references to "Microsoft Internet Controls" (InternetExplorer,
' READYSTATE_COMPLETE).
'
' Any run-time error is re-raised to the caller after the browser has been
' closed and the status bar has been restored.
Sub GrabWebData()
    Const PAGE_URL As String = "https://www.skyscanner.net/news/airports/heathrow-airport-live-flight-arrivals-and-departures"
    Const TABLE_ID As String = "flight-status-board-arrivals"
    Const MAX_COLS As Long = 5          ' columns A..E
    Const WAIT_SECONDS As Long = 30     ' upper bound for the table body to be populated

    Dim ie As InternetExplorer          ' the automated copy of Internet Explorer
    Dim fSht As Worksheet
    Dim tbl As Object, section As Object, ele As Object
    Dim y As Long, c As Long, cellCount As Long
    Dim deadline As Date
    Dim errNum As Long, errSrc As String, errDesc As String

    Set fSht = Sheets("Sheet1")
    Set ie = New InternetExplorer

    On Error GoTo Fail

    ie.Visible = False
    ie.navigate PAGE_URL

    ' Wait until IE reports the navigation as finished.
    Application.StatusBar = "Loading Flight Times"
    Do While ie.Busy Or ie.READYSTATE <> READYSTATE_COMPLETE
        DoEvents
    Loop

    ' READYSTATE_COMPLETE only covers the initial document. The question
    ' observed a cell count of 0 at this point, so poll (bounded) until the
    ' second child of the table actually contains <td> cells.
    deadline = DateAdd("s", WAIT_SECONDS, Now)
    Do
        Set section = Nothing
        Set tbl = ie.document.getElementById(TABLE_ID)
        If Not tbl Is Nothing Then
            If tbl.Children.Length > 1 Then Set section = tbl.Children(1)
        End If
        If Not section Is Nothing Then
            If section.getElementsByTagName("td").Length > 0 Then Exit Do
        End If
        DoEvents
    Loop While Now < deadline

    If section Is Nothing Then
        Debug.Print "Table '" & TABLE_ID & "' (or its second child) was not found"
        GoTo Done
    End If

    Debug.Print section.getElementsByTagName("td").Length

    y = 1
    For Each ele In section.getElementsByTagName("tr")
        Debug.Print ele.textContent

        ' Copy only the cells that exist so short rows do not raise an error.
        cellCount = ele.Children.Length
        If cellCount > MAX_COLS Then cellCount = MAX_COLS
        For c = 0 To cellCount - 1
            fSht.Cells(y, c + 1).Value = ele.Children(c).textContent
        Next c

        y = y + 1
    Next ele

    ' Format once after all rows are written instead of once per row.
    If y > 1 Then
        fSht.Cells.WrapText = False
        fSht.Rows.AutoFit
        fSht.Columns.AutoFit
    End If

Done:
    ' Best-effort cleanup: always close the hidden browser and give the
    ' status bar back to Excel.
    On Error Resume Next
    If Not ie Is Nothing Then ie.Quit
    Set ie = Nothing
    Application.StatusBar = False
    On Error GoTo 0
    If errNum <> 0 Then Err.Raise errNum, errSrc, errDesc
    Exit Sub

Fail:
    ' Remember the error, run the cleanup, then re-raise it from Done.
    errNum = Err.Number
    errSrc = Err.Source
    errDesc = Err.Description
    Resume Done
End Sub
答案 0 :(得分:1)
以下内容将帮助您入门。它使用 Selenium Basic。安装后,您需要添加对 Selenium Type Library 和 Microsoft HTML Object Library 的引用。
我很着急,所以我稍后会再提炼。
Option Explicit
' GetInfo
' Loads the Heathrow live-arrivals page in Chrome through Selenium Basic,
' copies the rendered <body> markup into an MSHTML HTMLDocument, and passes
' the table "flight-status-board-arrivals" together with a fixed list of
' column headers to WriteTable.
'
' Requires references to "Selenium Type Library" and
' "Microsoft HTML Object Library".
'
' The browser is closed and ScreenUpdating is restored even when a step
' fails; the original error is then re-raised to the caller.
Public Sub GetInfo()
    Const URL = "https://www.skyscanner.net/news/airports/heathrow-airport-live-flight-arrivals-and-departures"
    Const TABLE_ID As String = "flight-status-board-arrivals"

    Dim d As WebDriver, hTable As HTMLTable, html As HTMLDocument, headers()
    Dim errNum As Long, errSrc As String, errDesc As String

    headers = Array("Flight Details", "Status", "Scheduled Time", "Airline Flight", "Origin", "Terminal", "Status")
    Set d = New ChromeDriver

    Application.ScreenUpdating = False
    On Error GoTo Fail

    With d
        .Start "Chrome"
        .Get URL

        ' The returned element is not needed; the lookup is made only so that
        ' Selenium's implicit wait blocks until the table is in the DOM.
        .FindElementById TABLE_ID

        Set html = New HTMLDocument
        html.body.innerHTML = .FindElementByXPath("//body").Attribute("innerHTML")
    End With

    Set hTable = html.getElementById(TABLE_ID)
    If hTable Is Nothing Then
        Err.Raise vbObjectError + 513, "GetInfo", "Table '" & TABLE_ID & "' was not found in the page body."
    End If

    WriteTable hTable, headers

Done:
    ' Best-effort cleanup so a failure never leaves Chrome running or
    ' Excel's screen updating switched off.
    On Error Resume Next
    If Not d Is Nothing Then d.Quit
    Application.ScreenUpdating = True
    On Error GoTo 0
    If errNum <> 0 Then Err.Raise errNum, errSrc, errDesc
    Exit Sub

Fail:
    ' Remember the error, run the cleanup, then re-raise it from Done.
    errNum = Err.Number
    errSrc = Err.Source
    errDesc = Err.Description
    Resume Done
End Sub
' WriteTable
' Writes the innerText of every <td> in hTable to a worksheet, one sheet row
' per <tr>, then writes the supplied header captions over the first output row.
'
' Parameters:
'   hTable   - the HTML table to read.
'   headers  - one-dimensional array of column captions.
'   startRow - sheet row that receives the first <tr> (default 1).
'   ws       - destination worksheet; the active sheet when omitted.
'
' Rows that contain no <td> cells (for example a header row built from <th>)
' still consume one sheet row, so the captions land on that otherwise empty
' line when the table begins with such a row.
Public Sub WriteTable(ByVal hTable As HTMLTable, ByRef headers As Variant, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim tr As Object, td As Object, r As Long, c As Long

    r = startRow
    With ws
        For Each tr In hTable.getElementsByTagName("tr")
            ' Column must start at 1 for every row, including the first one;
            ' a Long defaults to 0 and .Cells(r, 0) is not a valid address.
            c = 1
            For Each td In tr.getElementsByTagName("td")
                .Cells(r, c).Value = td.innerText
                c = c + 1
            Next td
            r = r + 1
        Next tr

        ' Place the captions on the first output row (row 1 for the default
        ' startRow) and size the range from the array's real bounds.
        .Cells(startRow, 1).Resize(1, UBound(headers) - LBound(headers) + 1).Value = headers
    End With
End Sub
网页的当前示例快照:
当前代码输出的示例快照:
注意:
在检查页面时,将显示其他信息(上面屏幕截图中显示的可见列之前的前2列):
代码捕获了所有这些。
我很想知道是否可以把 webElements 的 innerHTML 转移出来,以便使用 .document 或其他 HTML DOM 对象的属性。我在 @Nerijus 的答案中找到了解决方法。
答案 1 :(得分:1)
尝试以下代码从该表获取数据。我已经在脚本中定义了 Explicit Wait(显式等待),以便它会一直等到该网页中的表格数据可用为止。尽管我强烈建议您按照 QHarr 所示的方式进行操作,但您也可以额外尝试一下这种方法。该脚本以无头模式运行,因此您不会看到任何浏览器窗口。不过,唯一的问题是您在操作过程中可能偶尔会遇到 stale element 错误,但并非总是如此。
这是脚本的样子:
' FetchData (Selenium-only variant)
' Drives a headless Chrome session, waits for the first body row of table
' #flight-status-board-arrivals, then writes the text of every <th>/<td>
' of every table row to the sheet, one sheet row per table row, starting
' at cell (1, 1).
Sub FetchData()
    Const link As String = "https://www.skyscanner.net/news/airports/heathrow-airport-live-flight-arrivals-and-departures"

    Dim firstBodyRow As Object
    Dim tableRow As Object
    Dim tableCell As Object
    Dim rowIdx As Long
    Dim colIdx As Long

    With New ChromeDriver
        .AddArgument "--headless"
        .get link

        ' Explicit wait: the lookup is given a Timeout of 30000 so the rows
        ' are present before they are enumerated below.
        Set firstBodyRow = .FindElementByCss("#flight-status-board-arrivals tbody tr", Timeout:=30000)

        rowIdx = 1
        For Each tableRow In .FindElementsByCss("#flight-status-board-arrivals tr")
            colIdx = 0
            For Each tableCell In tableRow.FindElementsByCss("th,td")
                colIdx = colIdx + 1
                Cells(rowIdx, colIdx) = tableCell.Text
            Next tableCell
            rowIdx = rowIdx + 1
        Next tableRow
    End With
End Sub
要大幅缩短执行时间,并借助 .PageSource 把页面源码交给 HTMLDocument 解析器来处理,您可以尝试如下做法。
' FetchData (PageSource + HTMLDocument variant)
' Uses headless Chrome only to load the page and wait for the first body row
' of table #flight-status-board-arrivals; the page source is then handed to
' an MSHTML HTMLDocument and the table is read from there, writing every
' cell's innerText to the sheet starting at cell (1, 1).
Sub FetchData()
    Const link As String = "https://www.skyscanner.net/news/airports/heathrow-airport-live-flight-arrivals-and-departures"

    Dim firstBodyRow As Object
    Dim tableRow As Object
    Dim tableCell As Object
    Dim rowIdx As Long
    Dim colIdx As Long
    Dim Html As New HTMLDocument

    With New ChromeDriver
        .AddArgument "--headless"
        .get link

        ' Explicit wait with a Timeout of 30000 before the source is captured.
        Set firstBodyRow = .FindElementByCss("#flight-status-board-arrivals tbody tr", timeout:=30000)

        ' Copy the rendered page into the local parser; the browser is not
        ' used after this point.
        Html.body.innerHTML = .PageSource
    End With

    rowIdx = 1
    For Each tableRow In Html.getElementById("flight-status-board-arrivals").Rows
        colIdx = 0
        For Each tableCell In tableRow.Cells
            colIdx = colIdx + 1
            Cells(rowIdx, colIdx) = tableCell.innerText
        Next tableCell
        rowIdx = rowIdx + 1
    Next tableRow
End Sub
在执行前添加到库的引用:
Selenium Type Library
Microsoft HTML Object Library