如何从多个网页中提取列出的数据 - 无法找到表格标签

时间:2017-07-26 08:45:39

标签: excel vb.net web web-scraping webpage




' return the document containg the DOM of the page strWebAddress
' returns Nothing if the timeout lngTimeoutInSeconds was reached
Public Function GetIEDocument(ByVal strWebAddress As String, Optional ByVal lngTimeoutInSeconds As Long = 15) As MSHTML.HTMLDocument
Dim IE As SHDocVw.InternetExplorer
Dim IEDocument As MSHTML.HTMLDocument
Dim dateNow As Date

' create an IE application, representing a tab
Set IE = New SHDocVw.InternetExplorer

' optionally make the application visible, though it will work perfectly fine in the background otherwise
IE.Visible = True

' open a webpage in the tab represented by IE and wait until the main request successfully finished
' times out after lngTimeoutInSeconds with a warning
IE.Navigate strWebAddress
dateNow = Now
Do While IE.Busy
    If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Function

' retrieve the webpage's content (that is, the HTML DOM) and wait until everything is loaded (images, etc.)
' times out after lngTimeoutInSeconds with a warning
Set IEDocument = IE.Document
dateNow = Now
Do While IEDocument.ReadyState <> "complete"
    If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Function

Set GetIEDocument = IEDocument
End Function


Public Sub GetTeamData()
Dim strWebAddress As String
Dim strH2AnchorContent As String
Dim IEDocument As MSHTML.HTMLDocument
Dim objH2 As MSHTML.HTMLHeaderElement
Dim objTable As MSHTML.HTMLTable
Dim objRow As MSHTML.HTMLTableRow
Dim objCell As MSHTML.HTMLTableCell
Dim lngRow As Long
Dim lngColumn As Long

' initialize some variables that should probably better be passed as paramaters or defined as constants
strWebAddress = "https://toolkit.financialexpress.net/santanderam"
strH2AnchorContent = "   "

' open page
Set IEDocument = GetIEDocument(strWebAddress)
If IEDocument Is Nothing Then
    MsgBox "Timeout reached opening this address:" & vbNewLine & strWebAddress, vbCritical
    Exit Sub
End If

' retrieve anchor element
For Each objH2 In IEDocument.getElementsByTagName("h2")
   If objH2.innerText = strH2AnchorContent Then Exit For
Next objH2
If objH2 Is Nothing Then
    MsgBox "Could not find """ & strH2AnchorContent & """ in DOM!", vbCritical
    Exit Sub
End If

' traverse HTML tree to desired table element
' * move up one element in the hierarchy
' * skip two elements to proceed to the third (interjected each time with whitespace that is interpreted as an element of its own)
' * move down two elements n the hierarchy
Set objTable = objH2.parentElement _
                    .NextSibling.NextSibling _
                    .NextSibling.NextSibling _
                    .NextSibling.NextSibling _
                    .Children(0) _

 '   iterate over the table and output its contents
lngRow = 1
 For Each objRow In objTable.Rows
    lngColumn = 1
     For Each objCell In objRow.Cells
         Cells(lngRow, lngColumn) = objCell.innerText
        lngColumn = lngColumn + 1
     Next objCell
     lngRow = lngRow + 1
End Sub


 strH2AnchorContent = "  "






    ' open a webpage in the tab represented by IE and wait until the main request successfully finished
' times out after lngTimeoutInSeconds with a warning
IE.Navigate strWebAddress
dateNow = Now
Do While IE.Busy
    If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Function

' retrieve the webpage's content (that is, the HTML DOM) and wait until everything is loaded (images, etc.)
' times out after lngTimeoutInSeconds with a warning
Set IEDocument = IE.Document
dateNow = Now
Do While IEDocument.ReadyState <> "complete"
    If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Function

Set GetIEDocument = IEDocument
End Function

Public Sub GetTeamData()
Dim strWebAddress As String
Dim strH2AnchorContent As String
Dim IEDocument As MSHTML.HTMLDocument
Dim objH2 As MSHTML.HTMLHeaderElement
Dim obTable As MSHTML.HTMLTable
Dim objRow As MSHTML.HTMLTableRow
Dim objCell As MSHTML.HTMLTableCell
Dim lngRow As Long
Dim lngColumn As Long

' initialize some variables that should probably better be passed as paramaters or defined as constants
strWebAddress = "https://toolkit.financialexpress.net/santanderam"

' open page
Set IEDocument = GetIEDocument(strWebAddress)
If IEDocument Is Nothing Then
    MsgBox "Timeout reached opening this address:" & vbNewLine &     strWebAddress, vbCritical
    Exit Sub
End If

' retrieve anchor element
Set oTable = IEDocument.getElementById("Price_1_1")
Debug.Print oTable.innerText

' iterate over the table and output its contents
lngRow = 1
For Each objRow In oTable.Rows
    lngColumn = 1
    For Each objCell In objRow.Cells
        Cells(lngRow, lngColumn) = objCell.innerText
        lngColumn = lngColumn + 1
    Next objCell
    lngRow = lngRow + 1
End Sub

1 个答案:

答案 0 :(得分:0)

您的代码运行正常,问题是您在加载表之前尝试从表中捕获数据。我添加了一个简单的 Wait 循环5秒钟,您当前的代码捕获了数据。以下是我在 Set oTable = IEDocument.getElementById("Price_1_1") 声明之前添加的循环:

dateNow = Now
bExitLoop = False
lngTimeoutInSeconds = 5
Do While Not bExitLoop
    If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Do
