VBA无法使用.getElementsByTag()或.getElementByID()从HTML获取数据

时间:2018-11-30 10:52:59

标签: html vba web-scraping getelementbyid getelementsbytagname

我当前的项目包括从HTML源代码中检索数据。 具体来说,我正在此网站上查看崩溃案例:

https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=112007272

我想通过查找特定标签/ ID的.innertext来从HTML收集所有相关数据。

到目前为止,我的代码:

Sub ExtractData()

mystart:

'First I create two Internet Explorer object

Set objIE = CreateObject("InternetExplorer.Application")      'this browser contains the list of cases
objIE.Top = 0
objIE.Left = 0
objIE.Width = 1600
objIE.Height = 900
objIE.Visible = True 'We can see IE

Set objIEdata = CreateObject("InternetExplorer.Application")    'this browser opens the specific case
objIEdata.Top = 0
objIEdata.Left = 0
objIEdata.Width = 1600
objIEdata.Height = 900
objIEdata.Visible = True 'We can see IE

On Error Resume Next
objIE.navigate ("https://crashviewer.nhtsa.dot.gov/LegacyCDS/Index")        'url of website

Do
    DoEvents
    If Err.Number <> 0 Then
        objIE.Quit
        Set objIE = Nothing
        GoTo mystart:
    End If
Loop Until objIE.readystate = 4

'we define an object variable Alllinks and loop through all the links to search for

Set aAlllinks = objIE.document.getElementsByTagName("button")                'looks for Search Button 
For Each Hyperlink In aAlllinks
    If Hyperlink.innertext = " Search" Then
        Hyperlink.Click
        Exit For
    Else
        MsgBox "Search Button was not found. Please improve code!"
    End If

Next

Application.Wait (Now + TimeValue("0:00:02"))

Set bAlllinks = objIE.document.getElementsByTagName("a")                     'all Hyperlinks on webpage start with Tag "a"
For Each Hyperlink In bAlllinks
    If UBound(Split(Hyperlink.innertext, "-")) = 2 And Len(Hyperlink.innertext) = 11 Then             'case specific to find the Hyperlinks which contain cases
        Debug.Print Hyperlink.innertext

        '2nd IE is used for each case

restart:
            objIEdata.navigate (Hyperlink.href)        'url of each case

            Do
                DoEvents
                If Err.Number <> 0 Then
                    objIEdata.Quit
                    Set objIE = Nothing
                    GoTo restart:
                End If
            Loop Until objIEdata.readystate = 4

            Set register = objIEdata.document.getElementByTagName("tbody")             'objIEdata.document.getElementByID("main").getElementByID("mainSection")  '.getElementByID("bodyMain").getElementsByTagName("tbody")
            For Each untermenue In register
                Debug.Print untermenue.innerHTML
            Next

            Application.Wait (Now + TimeValue("0:00:02"))




    End If
Next




objIE.Quit
objIEdata.Quit

End Sub

请注意,IE的可见性仅出于调试原因。

让我感到困惑的部分是

Set register = objIEdata.document.getElementByTagName("tbody")

如果我寻找.TagName("tbody"),则变量寄存器返回空,如果我寻找.ID("bodyMain"),也会发生同样的情况。不幸的是,我不熟悉HTML以及VBA如何与HTML文档进行交互。我的印象是,如果碰巧有一个元素,我可以按它们的ID处理所有元素,但这似乎行不通。

我是否需要通过HTML的“分支”来工作,或者代码应该能够找到每个ID,无论在哪个“子”中都能找到它?

非常感谢

1 个答案:

答案 0 :(得分:1)

您要问的是一个很大的请求,因此我将提供一些指针和起始代码。我的代码应该写出所有表,但是您将需要尝试获得所需的格式。有效地选择元素周围肯定有足够的逻辑对此有帮助。 *我还没有测试过使用该类遍历所有检索到的id的时间限制,但是已经测试了个别情况和所有id的检索。


要获取初始案例链接和ID:

我可能会使用一个函数来返回包含链接和ID的数组。如果您提取ID,则可以将它们传递给XMLHTTP请求,如下所示。

URL为https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search

Public Function GetLinksAndIds(ByVal URL) As Variant
    Dim ie As InternetExplorer, i As Long
    Set ie = New InternetExplorer
    With ie
        .Visible = True
        .navigate2 URL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

         While .Busy Or .readyState < 4: DoEvents: Wend

        Dim caseLinks As Object, id As String, newURL As String
        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")

        Dim linksAndIds()
        ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
        For i = 0 To caseLinks.Length - 1
           linksAndIds(i + 1, 1) = caseLinks.item(i)
           linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
        Next

        .Quit
    End With
    GetLinksAndIds = linksAndIds
End Function

示例返回值:

enter image description here


对于每种情况-使用XMLHTTP:

我很想避免使用IE,而是使用XMLHTTP request(使用print选项返回更可读的页面版本的URL编码查询字符串)。尽管我使用css选择器进行了解析,但是您可以将响应读入MSXML2.DOMDocument60并使用XPath查询。您可以将caseid连接到URL。

Option Explicit
Public Sub GetTables()
    Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=112007272&year=&fullimage=false", False '<==concatenate caseid into URL
        .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
        .send
        sResponse = .responseText
    End With

    Set html = New HTMLDocument
    html.body.innerHTML = sResponse
    Dim tables As Object, i As Long
    Set tables = html.querySelectorAll("table")
    For i = 0 To tables.Length - 1
        clipboard.SetText tables.item(i).outerHTML
        clipboard.PutInClipboard
        ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
    Next
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm '<< Function below modified from here

Public Function LastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    LastRow = sh.Cells.Find(What:="*", _
                            After:=sh.Range("A1"), _
                            Lookat:=xlPart, _
                            LookIn:=xlFormulas, _
                            SearchOrder:=xlByRows, _
                            SearchDirection:=xlPrevious, _
                            MatchCase:=False).Row
    On Error GoTo 0
End Function

使用一个类来保存xmlhttp对象,看起来完全(未经测试)是什么:

clsHTTP类:

Option Explicit

Private http As Object

Private Sub Class_Initialize()
    Set http = CreateObject("MSXML2.XMLHTTP")
End Sub

Public Function GetString(ByVal URL As String) As String
    Dim sResponse As String
    With http
        .Open "GET", URL, False
        .send
        sResponse = .responseText
    End With
End Function

标准模块1:

Option Explicit
Public Sub GetTables()
    Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
    Dim initialLinksURL As String, http As clsHTTP, i As Long, j As Long, newURL As String
    Set http = New clsHTTP
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
    Set html = New HTMLDocument
    initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"

    Dim linksAndIds()
    linksAndIds = GetLinksAndIds(initialLinksURL)

    For i = LBound(linksAndIds, 2) To UBound(linksAndIds, 2)

        newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
        html.body.innerHTML = http.GetString(newURL)
        Dim tables As Object

        Set tables = html.querySelectorAll("table")

        For j = 0 To tables.Length - 1
            clipboard.SetText tables.item(j).outerHTML
            clipboard.PutInClipboard
            ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
        Next
    Next
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm

Public Function LastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    LastRow = sh.Cells.Find(What:="*", _
                            After:=sh.Range("A1"), _
                            Lookat:=xlPart, _
                            LookIn:=xlFormulas, _
                            SearchOrder:=xlByRows, _
                            SearchDirection:=xlPrevious, _
                            MatchCase:=False).Row
    On Error GoTo 0
End Function

Public Function GetLinksAndIds(ByVal URL) As Variant
    Dim ie As InternetExplorer, i As Long
    Set ie = New InternetExplorer
    With ie
        .Visible = True
        .navigate URL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

         While .Busy Or .readyState < 4: DoEvents: Wend

        Dim caseLinks As Object, id As String, newURL As String
        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")

        Dim linksAndIds()
        ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
        For i = 0 To caseLinks.Length - 1
           linksAndIds(i + 1, 1) = caseLinks.item(i)
           linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
        Next

        .Quit
    End With
    GetLinksAndIds = linksAndIds
End Function

所有Internet Explorer选项:

Option Explicit

Public Sub GetTables()
    Dim sResponse As String, html As HTMLDocument, clipboard As Object, ws As Worksheet
    Dim initialLinksURL As String, i As Long, j As Long, newURL As String
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
    Set html = New HTMLDocument
    initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"

    Dim ie As InternetExplorer, caseLinks As Object
    Set ie = New InternetExplorer
    With ie
        .Visible = True
        .Navigate2 initialLinksURL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")

        Dim linksAndIds()
        ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
        For i = 0 To caseLinks.Length - 1
            linksAndIds(i + 1, 1) = caseLinks.item(i)
            linksAndIds(i + 1, 2) = Replace$(caseLinks.item(i), "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=", vbNullString)
        Next

        For i = LBound(linksAndIds, 2) To 2      ' UBound(linksAndIds, 2)

            newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
            .Navigate2 newURL

            While .Busy Or .readyState < 4: DoEvents: Wend

            Dim tables As Object

            Set tables = .document.querySelectorAll("table")

            For j = 0 To tables.Length - 1
                clipboard.SetText tables.item(j).outerHTML
                clipboard.PutInClipboard
                ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
            Next
        Next

        .Quit
    End With
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm

Public Function LastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    LastRow = sh.Cells.Find(What:="*", _
                            After:=sh.Range("A1"), _
                            Lookat:=xlPart, _
                            LookIn:=xlFormulas, _
                            SearchOrder:=xlByRows, _
                            SearchDirection:=xlPrevious, _
                            MatchCase:=False).Row
    On Error GoTo 0
End Function