从网站 (forexfactory.com) 抓取表格

时间:2021-04-06 04:17:40

标签: excel vba internet-explorer web-scraping

需要一些帮助来从forexfactory.com 抓取一个简单的表格。下面的代码有效,但格式非常不稳定,而且那里的列似乎有一个无法正常工作的列。我知道我使用的代码非常新手。乐于改进。

Sub Pulldata2()

 Dim ieObj As InternetExplorer
 Dim appIE As Object
 Dim htmlEle As IHTMLElement
 Dim i As Integer
 Dim strSheet As String
 Dim LastRow As Long
    
 strSheet = Sheet1.Range("A3")
    
       
    Set ieObj = New InternetExplorer
    ieObj.Visible = False
    ieObj.navigate Sheet1.Range("A3").Value
    
    
    Application.ScreenUpdating = False
    Application.DisplayStatusBar = False
    Application.Wait Now + TimeValue("00:00:03")
    Sheet2.Activate
    
        
    For Each htmlEle In ieObj.document.getElementsByClassName("calendar__table")(0).getElementsByTagName("tr")
        LastRow = Cells(Rows.Count, "A").End(xlUp).Row
        i = 1 + LastRow
        With ActiveSheet
        
            .Range("A" & i).Value = htmlEle.Children(1).textContent
            .Range("B" & i).Value = htmlEle.Children(2).textContent
            .Range("C" & i).Value = htmlEle.Children(3).textContent
            .Range("D" & i).Value = htmlEle.Children(4).textContent
            .Range("E" & i).Value = htmlEle.Children(5).textContent
            .Range("F" & i).Value = htmlEle.Children(6).textContent
            .Range("G" & i).Value = htmlEle.Children(7).textContent
            .Range("H" & i).Value = htmlEle.Children(8).textContent
       
          
    End With
    i = i + 1
    On Error Resume Next
Next htmlEle
        


End Sub

目前正在发生的事情: enter image description here

我想要的: enter image description here

enter image description here

1 个答案:

答案 0 :(得分:0)

首先要注意的是,内容存在于视图源中,这意味着它可以被 xmlhttp 请求而无需求助于浏览器的开销。

接下来要注意的是,每个目标列都可以由特定的类作为目标。为了从正确的行开始,还必须指定忽略标题的父类。

由于 css 选择器很长但并不复杂,我通过循环生成它。

我使用 css 选择器返回目标列中的所有 td 节点。我根据要检索的列数使用一点数学循环该列表(请注意,我不是在此 nodeList 中检索日期;我已经在脚本的前面获取了该列表,然后将其添加到占位符的第一列)。这样我只用所需的列重新创建表。


Option Explicit

Public Sub WriteTable()
    'tools > references > Microsoft HTML Object Library
    Dim html As MSHTML.HTMLDocument, xhr As Object
    
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    With xhr
        .Open "GET", "https://www.forexfactory.com/", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim dateText As String
    
    dateText = html.querySelector(".flexDatePicker").Value
    
    Dim css As String, base As String, classSet()
    
    base = ", .calendar__row > "                 'parent class to prefix each child class
    classSet = Array(".calendar__time", ".calendar__currency", ".calendar__event", _
                     ".calendar__actual", ".calendar__forecast", ".calendar__previous") 'classes to match on
    css = ".calendar__row >  " & Join$(classSet, base) 'final selector
    
    Dim tableCells As Object, r As Long, res As Long, c As Long, results()
    
    Set tableCells = html.querySelectorAll(css)
    r = 1
    
    ReDim results(1 To tableCells.Length / 6, 1 To 7) 'calc size of results array based on 6 columns + 1 for date (previously calculated)
    
    results(1, 1) = dateText
    
    For c = 0 To tableCells.Length - 1
        res = c Mod 6
        results(r, res + 2) = tableCells.Item(c).innerText
        If res = 5 Then r = r + 1                'new row in array
    Next
    
    'array fill down time and populate blank date rows
    For r = UBound(results, 1) To LBound(results, 1) + 1 Step -1
        results(r, 2) = IIf(results(r, 2) = vbNullString, results(r - 1, 2), results(r, 2))
        results(r, 1) = dateText
    Next
    
    Dim headers()
    
    headers = Array("Date", "Time", "Currency", "Event", "Actual", "Forecast", "Previous")
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
    
End Sub

阅读:

  1. CSS selectors

更传统的方法可能如下所示(尽管需要重构以减少嵌套级别):

Option Explicit

Public Sub WriteTable()
    'tools > references > Microsoft HTML Object Library
    Dim html As MSHTML.HTMLDocument, xhr As Object
    
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    With xhr
        .Open "GET", "https://www.forexfactory.com/", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim dateText As String
    
    dateText = html.querySelector(".flexDatePicker").Value
    
    Dim table As MSHTML.HTMLTable, r As Long, c As Long, results() As Variant, n As Long
    
    Set table = html.querySelector(".calendar__table")
  
    ReDim results(1 To table.Rows.Length - 4, 1 To 7) 'calc size of results array based on 6 columns + 1 for date (previously calculated)

    results(1, 1) = dateText
    
    Dim i As Long
    
    For r = 4 To table.Rows.Length - 1
        n = 2
        If table.Rows(r).Children.Length = 10 Then
            If table.Rows(r).Children(4).innerText <> vbNullString Then
                i = i + 1
                For c = 1 To table.Rows(r).Children.Length - 1
                    Select Case c
                    Case 1, 2, 4, 6, 7, 8
                        results(i, n) = table.Rows(r).Children(c).innerText
                        n = n + 1
                    End Select
                Next
            End If
        End If
    Next
    
    results = Application.Transpose(results)
    ReDim Preserve results(1 To 7, 1 To i)
    results = Application.Transpose(results)
    
    'array fill down time and populate blank date rows
    For r = UBound(results, 1) To LBound(results, 1) + 1 Step -1
        results(r, 2) = IIf(results(r, 2) = vbNullString, results(r - 1, 2), results(r, 2))
        results(r, 1) = dateText
    Next
    
    Dim headers() As Variant
    
    headers = Array("Date", "Time", "Currency", "Event", "Actual", "Forecast", "Previous")
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
    
End Sub