我当前的项目包括从HTML源代码中检索数据。 具体来说,我正在此网站上查看崩溃案例:
https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=112007272
我想通过查找特定标签/ ID的.innertext
来从HTML收集所有相关数据。
到目前为止,我的代码:
Sub ExtractData()
mystart:
'First I create two Internet Explorer object
Set objIE = CreateObject("InternetExplorer.Application") 'this browser contains the list of cases
objIE.Top = 0
objIE.Left = 0
objIE.Width = 1600
objIE.Height = 900
objIE.Visible = True 'We can see IE
Set objIEdata = CreateObject("InternetExplorer.Application") 'this browser opens the specific case
objIEdata.Top = 0
objIEdata.Left = 0
objIEdata.Width = 1600
objIEdata.Height = 900
objIEdata.Visible = True 'We can see IE
On Error Resume Next
objIE.navigate ("https://crashviewer.nhtsa.dot.gov/LegacyCDS/Index") 'url of website
Do
DoEvents
If Err.Number <> 0 Then
objIE.Quit
Set objIE = Nothing
GoTo mystart:
End If
Loop Until objIE.readystate = 4
'we define an object variable Alllinks and loop through all the links to search for
Set aAlllinks = objIE.document.getElementsByTagName("button") 'looks for Search Button
For Each Hyperlink In aAlllinks
If Hyperlink.innertext = " Search" Then
Hyperlink.Click
Exit For
Else
MsgBox "Search Button was not found. Please improve code!"
End If
Next
Application.Wait (Now + TimeValue("0:00:02"))
Set bAlllinks = objIE.document.getElementsByTagName("a") 'all Hyperlinks on webpage start with Tag "a"
For Each Hyperlink In bAlllinks
If UBound(Split(Hyperlink.innertext, "-")) = 2 And Len(Hyperlink.innertext) = 11 Then 'case specific to find the Hyperlinks which contain cases
Debug.Print Hyperlink.innertext
'2nd IE is used for each case
restart:
objIEdata.navigate (Hyperlink.href) 'url of each case
Do
DoEvents
If Err.Number <> 0 Then
objIEdata.Quit
Set objIE = Nothing
GoTo restart:
End If
Loop Until objIEdata.readystate = 4
Set register = objIEdata.document.getElementByTagName("tbody") 'objIEdata.document.getElementByID("main").getElementByID("mainSection") '.getElementByID("bodyMain").getElementsByTagName("tbody")
For Each untermenue In register
Debug.Print untermenue.innerHTML
Next
Application.Wait (Now + TimeValue("0:00:02"))
End If
Next
objIE.Quit
objIEdata.Quit
End Sub
请注意,IE的可见性仅出于调试原因。
让我感到困惑的部分是
Set register = objIEdata.document.getElementByTagName("tbody")
。
如果我寻找.TagName("tbody")
,则变量寄存器返回空,如果我寻找.ID("bodyMain")
,也会发生同样的情况。不幸的是,我不熟悉HTML以及VBA如何与HTML文档进行交互。我的印象是,如果碰巧有一个元素,我可以按它们的ID处理所有元素,但这似乎行不通。
我是否需要通过HTML的“分支”来工作,或者代码应该能够找到每个ID,无论在哪个“子”中都能找到它?
非常感谢
答案 0 :(得分:1)
您要问的是一个很大的请求,因此我将提供一些指针和起始代码。我的代码应该写出所有表,但是您将需要尝试获得所需的格式。有效地选择元素周围肯定有足够的逻辑对此有帮助。 *我还没有测试过使用该类遍历所有检索到的id的时间限制,但是已经测试了个别情况和所有id的检索。
要获取初始案例链接和ID:
我可能会使用一个函数来返回包含链接和ID的数组。如果您提取ID,则可以将它们传递给XMLHTTP请求,如下所示。
URL为https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search
Public Function GetLinksAndIds(ByVal URL) As Variant
Dim ie As InternetExplorer, i As Long
Set ie = New InternetExplorer
With ie
.Visible = True
.navigate2 URL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Dim caseLinks As Object, id As String, newURL As String
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
.Quit
End With
GetLinksAndIds = linksAndIds
End Function
示例返回值:
对于每种情况-使用XMLHTTP:
我很想避免使用IE,而是使用XMLHTTP
request(使用print选项返回更可读的页面版本的URL编码查询字符串)。尽管我使用css选择器进行了解析,但是您可以将响应读入MSXML2.DOMDocument60
并使用XPath
查询。您可以将caseid连接到URL。
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=112007272&year=&fullimage=false", False '<==concatenate caseid into URL
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = .responseText
End With
Set html = New HTMLDocument
html.body.innerHTML = sResponse
Dim tables As Object, i As Long
Set tables = html.querySelectorAll("table")
For i = 0 To tables.Length - 1
clipboard.SetText tables.item(i).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm '<< Function below modified from here
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
使用一个类来保存xmlhttp对象,看起来完全(未经测试)是什么:
clsHTTP类:
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal URL As String) As String
Dim sResponse As String
With http
.Open "GET", URL, False
.send
sResponse = .responseText
End With
End Function
标准模块1:
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Dim initialLinksURL As String, http As clsHTTP, i As Long, j As Long, newURL As String
Set http = New clsHTTP
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set html = New HTMLDocument
initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"
Dim linksAndIds()
linksAndIds = GetLinksAndIds(initialLinksURL)
For i = LBound(linksAndIds, 2) To UBound(linksAndIds, 2)
newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
html.body.innerHTML = http.GetString(newURL)
Dim tables As Object
Set tables = html.querySelectorAll("table")
For j = 0 To tables.Length - 1
clipboard.SetText tables.item(j).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
Next
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
Public Function GetLinksAndIds(ByVal URL) As Variant
Dim ie As InternetExplorer, i As Long
Set ie = New InternetExplorer
With ie
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Dim caseLinks As Object, id As String, newURL As String
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
.Quit
End With
GetLinksAndIds = linksAndIds
End Function
所有Internet Explorer选项:
Option Explicit
Public Sub GetTables()
Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
Dim initialLinksURL As String, i As Long, j As Long, newURL As String
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set html = New HTMLDocument
initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"
Dim ie As InternetExplorer, caseLinks As Object
Set ie = New InternetExplorer
With ie
.Visible = True
.Navigate2 initialLinksURL
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("btnSubmit1").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
Dim linksAndIds()
ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
For i = 0 To caseLinks.Length - 1
linksAndIds(i + 1, 1) = caseLinks.item(i)
linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
Next
For i = LBound(linksAndIds, 2) To 2 ' UBound(linksAndIds, 2)
newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
.Navigate2 newURL
While .Busy Or .readyState < 4: DoEvents: Wend
Dim tables As Object
Set tables = .document.querySelectorAll("table")
For j = 0 To tables.Length - 1
clipboard.SetText tables.item(j).outerHTML
clipboard.PutInClipboard
ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
Next
Next
.Quit
End With
End Sub
'https://www.rondebruin.nl/win/s9/win005.htm
Public Function LastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
LastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function