从网页上抓取表格

时间:2019-05-13 07:49:40

标签: excel vba web-scraping

我想在执行搜索后将表移至Excel中的工作表。我的代码正在访问网页,从工作表中输入值,但无法将表提取到Excel。有什么想法在我当前的代码中有什么问题以及如何使其工作吗?

    Sub GetFerryRatesAutomatic()
    Dim appIE As Object
        Dim tbl, trs, tr, tds, td, r, c

    Set appIE = CreateObject("internetexplorer.application")
    With appIE
        .Navigate "https://laevapiletid.ee/setlang/eng"
        .Visible = True
    End With

    Do While appIE.Busy
        DoEvents
    Loop

    appIE.Document.getElementsByName("trip_outbound")(0).Value = "HEL-TAL"
    appIE.Document.getElementsByName("trip_inbound")(0).Value = "TAL-HEL"

    appIE.Document.getElementsByName("vehicle")(0).Value = "CAR1"

    appIE.Document.getElementsByName("passenger[ADULT]")(0).Value = ThisWorkbook.Sheets("Other Data").Range("F18")

    appIE.Document.getElementsByName("trip_inbound_date")(0).Value = ThisWorkbook.Sheets("Other Data").Range("F20")
    appIE.Document.getElementsByName("trip_outbound_date")(0).Value = ThisWorkbook.Sheets("Other Data").Range("F19")

    appIE.Document.getElementsByClassName("btn btn-lg btn-block btn-primary")(0).Click

'This part is for extracting table

    Set tbl = appIE.Document.getElementsByTagName("travelSelect")(5)
        Set trs = tbl.getElementsByTagName("travels_tableOutbound")

        For r = 0 To trs.Length - 1
            Set tds = trs(r).getElementsByTagName("td")
            If tds.Length = 0 Then Set tds = trs(r).getElementsByTagName("th")

            For c = 0 To tds.Length - 1
                ActiveSheet.Range("A1").Offset(r, c).Value = tds(c).innerText
            Next c
        Next r

    'appIE.Quit
    Set appIE = Nothing

    End Sub 

这是我希望在工作表中包含的网页和表格的HTML:

enter image description here

2 个答案:

答案 0 :(得分:2)

我将使用的重写将包括一个定时循环,以确保在没有表的情况下表有时间加载和退出。使用属性选择器的性能降低,例如,在类选择器上使用属性选择器更具描述性和自我证明,这种情况很小,以至于在这种情况下无关紧要。

'VBE > Tools > References: Microsoft Internet Controls
Public Sub GetPriceInfo()
    Dim ie As New InternetExplorer, url As String, ws As Worksheet
    Dim t As Date, clipboard As Object, hTable As Object
    url = "https://laevapiletid.ee/"
    Const ADULTS As Long = 2
    Const MAX_WAIT_SEC As Long = 10

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")

    With ie
        .Visible = True
        .Navigate2 url

        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document
            .querySelector("[name=trip_outbound] [value='HEL-TAL']").Selected = True
            .querySelector("[name=trip_outbound_date]").Value = "14.05.2019"
            .querySelector("[name=trip_inbound] [value='TAL-HEL']").Selected = True
            .querySelector("[name=trip_inbound_date]").Value = "15.05.2019"
            .querySelector("#adultSpinnerValue").Value = ADULTS
            .querySelector("[name=vehicle] [value='NONE']").Selected = True
            .querySelector("[type=submit]").Click

            t = Timer
            Do
                On Error Resume Next
                Set hTable = .querySelector("#travels_tableOutbound")
                On Error GoTo 0
                If Timer - t > MAX_WAIT_SEC Then Exit Do
            Loop While hTable Is Nothing
        End With

        If InStr(hTable.outerHTML, "Arvutan...") > 0 Then
            t = Timer
            Do
                If Timer - t > MAX_WAIT_SEC Then Exit Do
            Loop Until Not InStr(hTable.outerHTML, "Arvutan...") > 0
            Set hTable = .document.querySelector("#travels_tableOutbound")
        End If

        If hTable Is Nothing Then Exit Sub
        clipboard.SetText hTable.outerHTML
        clipboard.PutInClipboard
        ws.Range("A1").PasteSpecial
        .Quit
    End With
End Sub

答案 1 :(得分:1)

travels_tableOutbound是元素的ID,而不是标签名,因此应使用getElementById代替getElementsByTagName

Set trs = tbl.getElementsByTagName("travels_tableOutbound")

应该是

Set trs = appIE.Document.getElementsByTagName("travels_tableOutbound")


选项2使用查询选择器获取元素:

Set trs = appIE.Document.querySelector("#travels_tableOutbound")