解析HTML getElementsByTagName不会返回所有单元格

时间:2019-03-21 13:57:04

标签: html excel vba web-scraping getelementsbytagname

我有一些代码可用于从网页上抓取数据,但是网页已更改,无法再使用。该代码应该在内部事务表上进行计算,但是getelementsbytagname("td")不再返回所有单元格。

我猜是因为它是嵌入页面或其他内容中的页面,但是我无法终生解决它,因为我对html不太熟悉。示例网页为gurufocus.com/stock/lmb/insider

我的代码如下:

Sub getStatements()
    Dim wb As Object
    Dim doc As Object
    Dim incomeStmtURLs As Variant
    Dim sURL As String
    Dim allCells As IHTMLElementCollection
    Dim aCell As HTMLTableCell
    Dim i As Integer
    Dim loginBoxData As String    

    Application.DisplayAlerts = False

    Call ToggleEvents(False)

    incomeStmtURLs = Range("Sheet1!h1:h2").Value

    For i = 1 To UBound(incomeStmtURLs)
        Set wb = CreateObject("internetExplorer.Application")
        sURL = incomeStmtURLs(i, 1)

        wb.navigate sURL
        wb.Visible = False

        While wb.Busy
            Application.Wait Now + #12:00:01 AM#
            DoEvents
        Wend

        'HTML document
        Set doc = wb.document

        On Error GoTo err_clear

        ' gets all cell and looks for date format,
        ' goes from new transaction to old so once gets to older than a year it exits for loop
        ' checks nextSibling from date is a buy and if so does calculations, by taking further value sin row
        ' for priceThisTime have to get rid of $ symbol for calculation
        Set allCells = doc.getElementsByTagName("td")
        For Each aCell In allCells
            MsgBox (aCell.innerText)
            If aCell.innerText Like "####-##-##" = True Then
                If CDate(aCell.innerText) >= Date - 365 Then
                    If aCell.NextSibling.innerText = "Buy" Then
                        buys = buys + 1
                        sharesThisTime = CDec(aCell.NextSibling.NextSibling.innerText)
                        priceThisTime = aCell.NextSibling.NextSibling.NextSibling.NextSibling.innerText
                        totalPrice = totalPrice + (sharesThisTime * CDec(Right(priceThisTime, Len(priceThisTime) - 1)))
                        shareCount = shareCount + sharesThisTime
                    End If
                Else
                    Exit For
                End If
            End If
        Next aCell

        Sheet6.Cells(i + 1, 2) = buys
        If (shareCount <> 0) Then
            Sheet6.Cells(i + 1, 3).Value = totalPrice / shareCount
        End If

        buys = 0
        totalPrice = 0
        shareCount = 0

err_clear:
        If Err <> 0 Then
            Err.Clear
            Resume Next
        End If
        wb.Quit
    Next i

    Call ToggleEvents(True)
End Sub

1 个答案:

答案 0 :(得分:0)

以下内容专门针对该表并检索所有td元素。我认为您的逻辑可能可以以列号为基础,但无论如何都可以应用(以防万一我也将表设置为变量)。

我将每页的结果设置为100,但是您可以将该行注释掉

Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls

Public Sub GetData()
    Dim ie As New InternetExplorer, lastDropDrownItemIndex As Long, dropDown As Object, t As Date
    Const MAX_WAIT_SEC As Long = 10


    With ie
        .Visible = True
        .Navigate2 "https://www.gurufocus.com/stock/lmb/insider"

        While .Busy Or .readyState < 4: DoEvents: Wend
        t = Timer

        Do
            Set dropDown = .document.querySelectorAll(".el-dropdown-menu__item")
            lastDropDrownItemIndex = dropDown.Length - 1
            If Timer - t > MAX_WAIT_SEC Then Exit Do
        Loop While lastDropDrownItemIndex < 1

        If dropDown.Length = 0 Then Exit Sub

        dropDown.item(lastDropDrownItemIndex).Click 'comment me out if don't want 100 results per page

        Dim tds As Object, table As Object
        Set tds = .document.getElementsByClassName("data-table")(0).getElementsByTagName("td")
        Set table = .document.getElementsByClassName("data-table")
        Stop
        .Quit
    End With
End Sub