从网页上抓取表格

时间:2020-01-18 21:34:20

标签: excel vba web-scraping

我正在尝试从网页上抓取一个表格。下面的代码能够执行到 Set hTable = html.querySelector 这一行,但执行后 hTable 的值为 Nothing。

任何人都可以建议如何正确识别页面上的值表吗?

https://www.morningstar.co.uk/uk/screener/fund.aspx#?filtersSelectedValue=%7B%22analystRatingScale%22:%7B%22id%22:%225%22%7D,%22starRating%22:%7B%22id%22:%225%22%7D%7D&page=1&perPage=10&sortField=legalName&sortOrder=asc&universeId=FOCAN$$ALL


    ' Downloads the screener page with a synchronous MSXML2.XMLHTTP GET, loads the
    ' response text into an in-memory HTMLDocument, looks up the results table with
    ' a CSS selector and copies its cells to worksheet "Sheet1" under a header row.
    '
    ' Requires a reference to the Microsoft HTML Object Library (HTMLDocument,
    ' HTMLTable are early-bound).
    '
    ' If the request does not return HTTP 200, or the selector matches nothing,
    ' a message is shown and the procedure exits instead of failing with
    ' run-time error 91 when the table variable is dereferenced.
    Public Sub GetSomeData()
        Const URL As String = "https://www.morningstar.co.uk/uk/screener/fund.aspx#?filtersSelectedValue=%7B%22analystRatingScale%22:%7B%22id%22:%225%22%7D,%22starRating%22:%7B%22id%22:%225%22%7D%7D&page=1&perPage=10&sortField=legalName&sortOrder=asc&universeId=FOCAN$$ALL"
        ' NOTE(review): as written, "ID.<name>" is a type selector for a tag called
        ' "ID" combined with a class; an id would be "#<name>" and a class
        ' ".<name>". Confirm against the page markup which one is intended.
        Const TABLE_SELECTOR As String = "ID.ec-screener-results-view-container-section-panel-table-securities"

        Dim html As HTMLDocument, hTable As HTMLTable, ws As Worksheet, headers()
        Dim td As Object, tr As Object, r As Long, c As Long
        Dim httpStatus As Long

        headers = Array("Tick", "Fund", "1 Day", "1 Week", "1 Month", "3 Months", "6 Months")
        Set ws = ThisWorkbook.Worksheets("Sheet1")
        Set html = New HTMLDocument

        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", URL, False          ' False = synchronous request
            .send
            httpStatus = .Status
            If httpStatus = 200 Then html.body.innerHTML = .responseText
        End With

        If httpStatus <> 200 Then
            MsgBox "Request failed with HTTP status " & httpStatus & ".", vbExclamation
            Exit Sub
        End If

        Set hTable = html.querySelector(TABLE_SELECTOR)

        ' querySelector returns Nothing when no element matches, e.g. when the
        ' table is not present in the downloaded markup.
        If hTable Is Nothing Then
            MsgBox "No element matches selector '" & TABLE_SELECTOR & "' in the downloaded HTML.", vbExclamation
            Exit Sub
        End If

        r = 1
        With ws
            .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            For Each tr In hTable.getElementsByTagName("tr")
                r = r + 1: c = 1
                ' r > 3 skips the first two <tr> elements; the third one is
                ' written to worksheet row 2, directly below the headers.
                If r > 3 Then
                    For Each td In tr.getElementsByTagName("td")
                        ' Column 2 gets a leading apostrophe so Excel stores the value as text.
                        .Cells(r - 2, c) = IIf(c = 2, "'" & td.innerText, td.innerText)
                        c = c + 1
                    Next
                End If
            Next
        End With
    End Sub

1 个答案:

答案 0 :(得分:0)

由于页面上有很多动态元素,因此无法使用 MSXML2.XMLHTTP 从此页面中读取任何数据。唯一的方法是使用 IE 或 Selenium 这类可以处理 JavaScript 的环境。

通过保存生成的HTML文档并打开它来自己查看:

' Diagnostic helper: downloads the screener page with a synchronous
' MSXML2.XMLHTTP GET, loads the response into an in-memory HTMLDocument and
' writes the resulting body markup to a local file so it can be inspected.
'
' Requires a reference to the Microsoft HTML Object Library.
Public Sub GetSomeData()
  Const URL As String = "https://www.morningstar.co.uk/uk/screener/fund.aspx#?filtersSelectedValue=%7B%22analystRatingScale%22:%7B%22id%22:%225%22%7D,%22starRating%22:%7B%22id%22:%225%22%7D%7D&page=1&perPage=10&sortField=legalName&sortOrder=asc&universeId=FOCAN$$ALL"
  Dim html As HTMLDocument
  Dim fileNum As Integer

  Set html = New HTMLDocument
  With CreateObject("MSXML2.XMLHTTP")
    .Open "GET", URL, False   ' False = synchronous request
    .send
    html.body.innerHTML = .responseText
  End With

  '-----
  ' FreeFile avoids clashing with a file number that is already open, and
  ' Close #fileNum closes only this file (a bare Close closes every open file).
  fileNum = FreeFile
  Open "E:\MorningStar.htm" For Output As #fileNum 'Use your own path
  Print #fileNum, html.body.outerHTML
  Close #fileNum
  '-----
End Sub