使用VBA从网页中提取表格

时间:2018-09-27 02:44:39

标签: html excel vba web-scraping

我想使用VBA将表格从html代码提取到Excel中。

我尝试了以下代码,多次更改了一些代码,但不断出错。

Sub GrabTable()

    'dimension (set aside memory for) our variables
    Dim objIE As InternetExplorer
    Dim ele As Object
    Dim y As Integer

    'start a new browser instance
    Set objIE = New InternetExplorer
    'make browser visible
    objIE.Visible = False

    'navigate to page with needed data
    objIE.navigate "http://www.bursamalaysia.com/market/listed-companies/company-announcements/5923061"
    'wait for page to load
    Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop

    'we will output data to excel, starting on row 1
    y = 1

    'look at all the 'tr' elements in the 'table' with id 'InputTable2',
    'and evaluate each, one at a time, using 'ele' variable
    For Each ele In objIE.document.getElementByClassName("InputTable2").getElementsByTagName("tr")
        'show the text content of 'td' element being looked at
        Debug.Print ele.textContent
        'each 'tr' (table row) element contains 2 children ('td') elements
        'put text of 1st 'td' in col A
        Sheets("Sheet1").Range("A" & y).Value = ele.Children(0).textContent
        'put text of 2nd 'td' in col B
        Sheets("Sheet1").Range("B" & y).Value = ele.Children(1).textContent

        y = y + 1
    'repeat until last ele has been evaluated
    Next


End Sub

2 个答案:

答案 0 :(得分:1)

我向您展示了两种方法:

  1. 使用IE:数据位于需要协商的iframe中

  2. 使用XMLHTTP请求-更快且无需打开浏览器。它使用iframe文档网址的第一部分,即iframe导航到的内容。

在两种情况下,我都访问包含公司名称的表,然后访问公开信息表。对于公开的主要信息表,我将externalHTML复制到剪贴板,然后粘贴到Excel,以避免循环所有行和列。您可以直接在其中设置循环tr(表行)和td(表单元格)。


IE:

Option Explicit
Public Sub GetInfo()
    Dim IE As New InternetExplorer, clipboard As Object
    With IE
        .Visible = True
        .navigate "http://www.bursamalaysia.com/market/listed-companies/company-announcements/5923061"

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
        With .document.getElementById("bm_ann_detail_iframe").contentDocument
            ThisWorkbook.Worksheets("Sheet1").Cells(1, 1) = .getElementsByClassName("company_name")(0).innerText
            clipboard.SetText .getElementsByTagName("table")(1).outerHTML
            clipboard.PutInClipboard
        End With

        ThisWorkbook.Worksheets("Sheet1").Cells(2, 1).PasteSpecial
        .Quit
    End With
End Sub

XMLHTTP:

您可以从iframe URL的前端提取另一个URL,并按如下所示使用它。

这是原始HTML的一部分,其中显示了iframe和相关的新网址信息:

enter image description here

Option Explicit
Public Sub GetTable()
    Dim sResponse As String, html As HTMLDocument, clipboard As Object
    Set html = New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "http://disclosure.bursamalaysia.com/FileAccess/viewHtml?e=2891609", False
        .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    html.body.innerHTML = sResponse

    With html
        ThisWorkbook.Worksheets("Sheet1").Cells(1, 1) = .querySelector(".company_name").innerText
        Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
        clipboard.SetText .querySelector(".InputTable2").outerHTML
        clipboard.PutInClipboard
    End With

    ThisWorkbook.Worksheets("Sheet1").Cells(2, 1).PasteSpecial

End Sub

答案 1 :(得分:1)

尝试一下。

Sub Web_Table_Option_Two()
    Dim HTMLDoc As New HTMLDocument
    Dim objTable As Object
    Dim lRow As Long
    Dim lngTable As Long
    Dim lngRow As Long
    Dim lngCol As Long
    Dim ActRw As Long
    Dim objIE As InternetExplorer
    Set objIE = New InternetExplorer
    objIE.Navigate "http://www.bursamalaysia.com/market/listed-companies/company-announcements/5923061"

    Do Until objIE.ReadyState = 4 And Not objIE.Busy
        DoEvents
    Loop
    Application.Wait (Now + TimeValue("0:00:03")) 'wait for java script to load
    HTMLDoc.body.innerHTML = objIE.Document.body.innerHTML
    With HTMLDoc.body
        Set objTable = .getElementsByTagName("table")
        For lngTable = 0 To objTable.Length - 1
            For lngRow = 0 To objTable(lngTable).Rows.Length - 1
                For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                    ThisWorkbook.Sheets("Sheet1").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
                Next lngCol
            Next lngRow
            ActRw = ActRw + objTable(lngTable).Rows.Length + 1
        Next lngTable
    End With
    objIE.Quit
End Sub