使用VBA宏

时间:2017-02-28 18:14:08

标签: json vba excel-vba web-scraping xmlhttprequest

我试图从网站上抓取多个表格。到目前为止,我已经构建了一个excel VBA宏来执行此操作。我还想出了如何在网站的多个页面上获取所有数据。例如,如果我有1000个结果,但每页显示50个。问题是我在多个页面上有相同的5个表,因为每个表有1000个结果。

我的代码只能遍历1个表的每个页面。我也有编写代码来抓取每个表,但我无法弄清楚如何为50个搜索结果(每个页面)中的每一个执行此操作。

如何循环遍历多个表并单击流程中的下一页以捕获所有数据?

Sub ETFDat()

    Dim IE As Object
    Dim i As Long
    Dim strText As String
    Dim jj As Long
    Dim hBody As Object
    Dim hTR As Object
    Dim hTD As Object
    Dim tb As Object
    Dim bb As Object
    Dim Tr As Object
    Dim Td As Object
    Dim ii As Long
    Dim doc As Object
    Dim hTable As Object
    Dim y As Long
    Dim z As Long
    Dim wb As Excel.Workbook
    Dim ws As Excel.Worksheet

    Set wb = Excel.ActiveWorkbook
    Set ws = wb.ActiveSheet
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = True
    y = 1   'Column A in Excel
    z = 1   'Row 1 in Excel
    Sheets("Fund Basics").Activate
    Cells.Select
    Selection.Clear

    IE.navigate "http://www.etf.com/channels/smart-beta-etfs/channels/smart-       beta-etfs?qt-tabs=0#qt-tabs" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
    Do While IE.busy: DoEvents: Loop
    Do While IE.ReadyState <> 4: DoEvents: Loop
    Set doc = IE.document
    Set hTable = doc.getElementsByTagName("table")    '.GetElementByID("tablePerformance")
    ii = 1
    Do While ii <= 17
        For Each tb In hTable
            Set hBody = tb.getElementsByTagName("tbody")
            For Each bb In hBody
                Set hTR = bb.getElementsByTagName("tr")
                For Each Tr In hTR
                    Set hTD = Tr.getElementsByTagName("td")
                    y = 1 ' Resets back to column A
                    For Each Td In hTD
                        ws.Cells(z, y).Value = Td.innerText
                        y = y + 1
                    Next Td
                    DoEvents
                    z = z + 1
                Next Tr
                Exit For
            Next bb
            Exit For
        Next tb
        With doc
            Set elems = .getElementsByTagName("a")
            For Each e In elems
                If (e.getAttribute("id") = "nextPage") Then
                    e.Click
                    Exit For
                End If
            Next e
        End With
        ii = ii + 1
        Application.Wait (Now + TimeValue("00:00:05"))
    Loop

    MsgBox "Done"

End Sub

1 个答案:

答案 0 :(得分:0)

有一个示例显示如何使用XHR和JSON解析从网站检索数据,它包含几个步骤。

  1. 检索数据。
  2. 我使用Chrome开发者工具网络标签了解了XHR。 我找到的大多数相关数据是GET XHR在点击下一页按钮后从http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/50/50/1返回的JSON字符串:

    GET XHR

    响应具有以下结构for single row item

    [
      {
        "productId": 576,
        "fund": "iShares Russell 1000 Value ETF",
        "ticker": "IWD",
        "inceptionDate": "2000-05-22",
        "launchDate": "2000-05-22",
        "hasSegmentReport": "true",
        "genericReport": "false",
        "hasReport": "true",
        "fundsInSegment": 20,
        "economicDevelopment": "Developed Markets",
        "totalRows": 803,
        "fundBasics": {
          "issuer": "<a href='/channels/blackrock-etfs' alt='BlackRock'>BlackRock</a>",
          "expenseRatio": {
            "value": 20
          },
          "aum": {
            "value": 36957230250
          },
          "spreadPct": {
            "value": 0.000094
          },
          "segment": "Equity: U.S. - Large Cap Value"
        },
        "performance": {
          "priceTrAsOf": "2017-02-27",
          "priceTr1Mo": {
            "value": 0.031843
          },
          "priceTr3Mo": {
            "value": 0.070156
          },
          "priceTr1Yr": {
            "value": 0.281541
          },
          "priceTr3YrAnnualized": {
            "value": 0.099171
          },
          "priceTr5YrAnnualized": {
            "value": 0.13778
          },
          "priceTr10YrAnnualized": {
            "value": 0.061687
          }
        },
        "analysis": {
          "analystPick": null,
          "opportunitiesList": null,
          "letterGrade": "A",
          "efficiencyScore": 97.977103,
          "tradabilityScore": 99.260541,
          "fitScore": 84.915658,
          "leveragedFactor": null,
          "exposureReset": null,
          "avgDailyDollarVolume": 243848188.037378,
          "avgDailyShareVolume": 2148400.688889,
          "spread": {
            "value": 0.010636
          },
          "fundClosureRisk": "Low"
        },
        "fundamentals": {
          "dividendYield": {
            "value": 0.021543
          },
          "equity": {
            "pe": 27.529645,
            "pb": 1.964124
          },
          "fixedIncome": {
            "duration": null,
            "creditQuality": null,
            "ytm": {
              "value": null
            }
          }
        },
        "classification": {
          "assetClass": "Equity",
          "strategy": "Value",
          "region": "North America",
          "geography": "U.S.",
          "category": "Size and Style",
          "focus": "Large Cap",
          "niche": "Value",
          "inverse": "false",
          "leveraged": "false",
          "etn": "false",
          "selectionCriteria": "Multi-Factor",
          "weightingScheme": "Multi-Factor",
          "activePerSec": "false",
          "underlyingIndex": "Russell 1000 Value Index",
          "indexProvider": "Russell",
          "brand": "iShares"
        },
        "tax": {
          "legalStructure": "Open-Ended Fund",
          "maxLtCapitalGainsRate": 20,
          "maxStCapitalGainsRate": 39.6,
          "taxReporting": "1099"
        }
      }
    ]
    
    1. 属性"totalRows": 803指定总行数。因此,为了尽可能快地进行数据检索,最好使请求获得第一行。正如您在URL中看到的那样,有../-aum/50/50/..尾部,它指向排序顺序,要开始的项目以及要返回的总项目。因此,要获得唯一的行,它应该是http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1

    2. 解析检索到的JSON,从totalRows属性中获取总行数。

    3. 再提出一个请求来获取整个表格。

    4. 解析整个表JSON,将其转换为2d数组并输出。您可以通过直接访问阵列来执行进一步处理。

    5. 如下表所示:

      table

      结果表包含803行和带有列的标题,如下所示:

      productId
      fund
      ticker
      inceptionDate
      launchDate
      hasSegmentReport
      genericReport
      hasReport
      fundsInSegment
      economicDevelopment
      totalRows
      fundBasics_issuer
      fundBasics_expenseRatio_value
      fundBasics_aum_value
      fundBasics_spreadPct_value
      fundBasics_segment
      performance_priceTrAsOf
      performance_priceTr1Mo_value
      performance_priceTr3Mo_value
      performance_priceTr1Yr_value
      performance_priceTr3YrAnnualized_value
      performance_priceTr5YrAnnualized_value
      performance_priceTr10YrAnnualized_value
      analysis_analystPick
      analysis_opportunitiesList
      analysis_letterGrade
      analysis_efficiencyScore
      analysis_tradabilityScore
      analysis_fitScore
      analysis_leveragedFactor
      analysis_exposureReset
      analysis_avgDailyDollarVolume
      analysis_avgDailyShareVolume
      analysis_spread_value
      analysis_fundClosureRisk
      fundamentals_dividendYield_value
      fundamentals_equity_pe
      fundamentals_equity_pb
      fundamentals_fixedIncome_duration
      fundamentals_fixedIncome_creditQuality
      fundamentals_fixedIncome_ytm_value
      classification_assetClass
      classification_strategy
      classification_region
      classification_geography
      classification_category
      classification_focus
      classification_niche
      classification_inverse
      classification_leveraged
      classification_etn
      classification_selectionCriteria
      classification_weightingScheme
      classification_activePerSec
      classification_underlyingIndex
      classification_indexProvider
      classification_brand
      tax_legalStructure
      tax_maxLtCapitalGainsRate
      tax_maxStCapitalGainsRate
      tax_taxReporting
      

      将以下代码放入VBA Project标准模块:

      Option Explicit
      
      Sub GetData()
      
          Dim sJSONString As String
          Dim vJSON As Variant
          Dim sState As String
          Dim lRowsQty As Long
          Dim aData()
          Dim aHeader()
      
          ' Download and parse the only first row to get total rows qty
          sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1")
          JSON.Parse sJSONString, vJSON, sState
          lRowsQty = vJSON(0)("totalRows")
          ' Download and parse the entire data
          sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/" & lRowsQty & "/1")
          JSON.Parse sJSONString, vJSON, sState
          ' Convert JSON to 2d array
          JSON.ToArray vJSON, aData, aHeader
          ' Output
          With Sheets(1)
              .Cells.Delete
              OutputArray .Cells(1, 1), aHeader
              Output2DArray .Cells(2, 1), aData
              .Cells.Columns.AutoFit
          End With
      
      End Sub
      
      Function GetXHR(sURL As String) As String
      
          With CreateObject("MSXML2.XMLHTTP")
              .Open "GET", sURL, False
              .Send
              GetXHR = .responseText
          End With
      
      End Function
      
      Sub OutputArray(oDstRng As Range, aCells As Variant)
      
          With oDstRng
              .Parent.Select
              With .Resize( _
                      1, _
                      UBound(aCells) - LBound(aCells) + 1)
                  .NumberFormat = "@"
                  .Value = aCells
              End With
          End With
      
      End Sub
      
      Sub Output2DArray(oDstRng As Range, aCells As Variant)
      
          With oDstRng
              .Parent.Select
              With .Resize( _
                      UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                      UBound(aCells, 2) - LBound(aCells, 2) + 1)
                  .NumberFormat = "@"
                  .Value = aCells
              End With
          End With
      
      End Sub
      

      再创建一个标准模块,将其命名为JSON并将下面的代码放入其中,此代码提供JSON处理功能:

      Option Explicit
      
      Private sBuffer As String
      Private oTokens As Object
      Private oRegEx As Object
      Private bMatch As Boolean
      Private oChunks As Object
      Private oHeader As Object
      Private aData() As Variant
      Private i As Long
      
      Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
      
          ' Backus–Naur form JSON parser implementation based on RegEx
          ' Input:
          ' sSample - source JSON string
          ' Output:
          ' vJson - created object or array to be returned as result
          ' sState - string Object|Array|Error depending on processing
      
          sBuffer = sSample
          Set oTokens = CreateObject("Scripting.Dictionary")
          Set oRegEx = CreateObject("VBScript.RegExp")
          With oRegEx ' Patterns based on specification http://www.json.org/
              .Global = True
              .MultiLine = True
              .IgnoreCase = True ' Unspecified True, False, Null accepted
              .Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
              Tokenize "s"
              .Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
              Tokenize "d"
              .Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
              Tokenize "c"
              .Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
              Tokenize "n"
              .Pattern = "\s+"
              sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
              .MultiLine = False
              Do
                  bMatch = False
                  .Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
                  Tokenize "p"
                  .Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
                  Tokenize "o"
                  .Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
                  Tokenize "a"
              Loop While bMatch
              .Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
              If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
                  Retrieve sBuffer, vJSON
                  sState = IIf(IsObject(vJSON), "Object", "Array")
              Else
                  vJSON = Null
                  sState = "Error"
              End If
          End With
          Set oTokens = Nothing
          Set oRegEx = Nothing
      
      End Sub
      
      Private Sub Tokenize(sType)
      
          Dim aContent() As String
          Dim lCopyIndex As Long
          Dim i As Long
          Dim sKey As String
      
          With oRegEx.Execute(sBuffer)
              If .Count = 0 Then Exit Sub
              ReDim aContent(0 To .Count - 1)
              lCopyIndex = 1
              For i = 0 To .Count - 1
                  With .Item(i)
                      sKey = "<" & oTokens.Count & sType & ">"
                      oTokens(sKey) = .Value
                      aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
                      lCopyIndex = .FirstIndex + .Length + 1
                  End With
              Next
          End With
          sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
          bMatch = True
      
      End Sub
      
      Private Sub Retrieve(sTokenKey, vTransfer)
      
          Dim sTokenValue As String
          Dim sName As String
          Dim vValue As Variant
          Dim aTokens() As String
          Dim i As Long
      
          sTokenValue = oTokens(sTokenKey)
          With oRegEx
              .Global = True
              Select Case Left(Right(sTokenKey, 2), 1)
                  Case "o"
                      Set vTransfer = CreateObject("Scripting.Dictionary")
                      aTokens = Split(sTokenValue, "<")
                      For i = 1 To UBound(aTokens)
                          Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
                      Next
                  Case "p"
                      aTokens = Split(sTokenValue, "<", 4)
                      Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
                      Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
                      If IsObject(vValue) Then
                          Set vTransfer(sName) = vValue
                      Else
                          vTransfer(sName) = vValue
                      End If
                  Case "a"
                      aTokens = Split(sTokenValue, "<")
                      If UBound(aTokens) = 0 Then
                          vTransfer = Array()
                      Else
                          ReDim vTransfer(0 To UBound(aTokens) - 1)
                          For i = 1 To UBound(aTokens)
                              Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
                              If IsObject(vValue) Then
                                  Set vTransfer(i - 1) = vValue
                              Else
                                  vTransfer(i - 1) = vValue
                              End If
                          Next
                      End If
                  Case "n"
                      vTransfer = sTokenValue
                  Case "s"
                      vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
                          Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
                          "\""", """"), _
                          "\\", "\"), _
                          "\/", "/"), _
                          "\b", Chr(8)), _
                          "\f", Chr(12)), _
                          "\n", vbLf), _
                          "\r", vbCr), _
                          "\t", vbTab)
                      .Global = False
                      .Pattern = "\\u[0-9a-fA-F]{4}"
                      Do While .Test(vTransfer)
                          vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
                      Loop
                  Case "d"
                      vTransfer = Evaluate(sTokenValue)
                  Case "c"
                      Select Case LCase(sTokenValue)
                          Case "true"
                              vTransfer = True
                          Case "false"
                              vTransfer = False
                          Case "null"
                              vTransfer = Null
                      End Select
              End Select
          End With
      
      End Sub
      
      Function Serialize(vJSON As Variant) As String
      
          Set oChunks = CreateObject("Scripting.Dictionary")
          SerializeElement vJSON, ""
          Serialize = Join(oChunks.Items(), "")
          Set oChunks = Nothing
      
      End Function
      
      Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
      
          Dim aKeys() As Variant
          Dim i As Long
      
          With oChunks
              Select Case VarType(vElement)
                  Case vbObject
                      If vElement.Count = 0 Then
                          .Item(.Count) = "{}"
                      Else
                          .Item(.Count) = "{" & vbCrLf
                          aKeys = vElement.Keys
                          For i = 0 To UBound(aKeys)
                              .Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
                              SerializeElement vElement(aKeys(i)), sIndent & vbTab
                              If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
                              .Item(.Count) = vbCrLf
                          Next
                          .Item(.Count) = sIndent & "}"
                      End If
                  Case Is >= vbArray
                      If UBound(vElement) = -1 Then
                          .Item(.Count) = "[]"
                      Else
                          .Item(.Count) = "[" & vbCrLf
                          For i = 0 To UBound(vElement)
                              .Item(.Count) = sIndent & vbTab
                              SerializeElement vElement(i), sIndent & vbTab
                              If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
                              .Item(.Count) = vbCrLf
                          Next
                          .Item(.Count) = sIndent & "]"
                      End If
                  Case vbInteger, vbLong
                      .Item(.Count) = vElement
                  Case vbSingle, vbDouble
                      .Item(.Count) = Replace(vElement, ",", ".")
                  Case vbNull
                      .Item(.Count) = "null"
                  Case vbBoolean
                      .Item(.Count) = IIf(vElement, "true", "false")
                  Case Else
                      .Item(.Count) = """" & _
                          Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
                              "\", "\\"), _
                              """", "\"""), _
                              "/", "\/"), _
                              Chr(8), "\b"), _
                              Chr(12), "\f"), _
                              vbLf, "\n"), _
                              vbCr, "\r"), _
                              vbTab, "\t") & _
                          """"
              End Select
          End With
      
      End Sub
      
      Function ToString(vJSON As Variant) As String
      
          Select Case VarType(vJSON)
              Case vbObject, Is >= vbArray
                  Set oChunks = CreateObject("Scripting.Dictionary")
                  ToStringElement vJSON, ""
                  oChunks.Remove 0
                  ToString = Join(oChunks.Items(), "")
                  Set oChunks = Nothing
              Case vbNull
                  ToString = "Null"
              Case vbBoolean
                  ToString = IIf(vJSON, "True", "False")
              Case Else
                  ToString = CStr(vJSON)
          End Select
      
      End Function
      
      Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
      
          Dim aKeys() As Variant
          Dim i As Long
      
          With oChunks
              Select Case VarType(vElement)
                  Case vbObject
                      If vElement.Count = 0 Then
                          .Item(.Count) = "''"
                      Else
                          .Item(.Count) = vbCrLf
                          aKeys = vElement.Keys
                          For i = 0 To UBound(aKeys)
                              .Item(.Count) = sIndent & aKeys(i) & ": "
                              ToStringElement vElement(aKeys(i)), sIndent & vbTab
                              If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
                          Next
                      End If
                  Case Is >= vbArray
                      If UBound(vElement) = -1 Then
                          .Item(.Count) = "''"
                      Else
                          .Item(.Count) = vbCrLf
                          For i = 0 To UBound(vElement)
                              .Item(.Count) = sIndent & i & ": "
                              ToStringElement vElement(i), sIndent & vbTab
                              If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
                          Next
                      End If
                  Case vbNull
                      .Item(.Count) = "Null"
                  Case vbBoolean
                      .Item(.Count) = IIf(vElement, "True", "False")
                  Case Else
                      .Item(.Count) = CStr(vElement)
              End Select
          End With
      
      End Sub
      
      Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
      
          ' Input:
          ' vJSON - Array or Object which contains rows data
          ' Output:
          ' aData - 2d array representing JSON data
          ' aHeader - 1d array of property names
      
          Dim sName As Variant
      
          Set oHeader = CreateObject("Scripting.Dictionary")
          Select Case VarType(vJSON)
              Case vbObject
                  If vJSON.Count > 0 Then
                      ReDim aData(0 To vJSON.Count - 1, 0 To 0)
                      oHeader("#") = 0
                      i = 0
                      For Each sName In vJSON
                          aData(i, 0) = "#" & sName
                          ToArrayElement vJSON(sName), ""
                          i = i + 1
                      Next
                  Else
                      ReDim aData(0 To 0, 0 To 0)
                  End If
              Case Is >= vbArray
                  If UBound(vJSON) >= 0 Then
                      ReDim aData(0 To UBound(vJSON), 0 To 0)
                      For i = 0 To UBound(vJSON)
                          ToArrayElement vJSON(i), ""
                      Next
                  Else
                      ReDim aData(0 To 0, 0 To 0)
                  End If
              Case Else
                  ReDim aData(0 To 0, 0 To 0)
                  aData(0, 0) = ToString(vJSON)
          End Select
          aHeader = oHeader.Keys()
          Set oHeader = Nothing
          aRows = aData
          Erase aData
      
      End Sub
      
      Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
      
          Dim sName As Variant
          Dim j As Long
      
          Select Case VarType(vElement)
              Case vbObject ' collection of objects
                  For Each sName In vElement
                      ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
                  Next
              Case Is >= vbArray  ' collection of arrays
                  For j = 0 To UBound(vElement)
                      ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
                  Next
              Case Else
                  If Not oHeader.Exists(sFieldName) Then
                      oHeader(sFieldName) = oHeader.Count
                      If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
                  End If
                  j = oHeader(sFieldName)
                  aData(i, j) = ToString(vElement)
          End Select
      
      End Sub