VBA-不含ID的报废HTML表

时间:2018-07-19 16:56:01

标签: html vba web-scraping

我正在尝试使用VBA从html表中获取数据。从列表框中选择一个值后,填充一个文本框并单击一个按钮,表格就会出现。但是该网站的网址不会更改。

我的程序确实填充了该框,选择了列表框的值,然后单击“搜索”按钮,但是我无法从表中获取数据。

我需要页面末尾的表格单元格的值。 (第二

这是页面的url

代码:

Sub Info()

Dim enlace As String
Dim id As String
Dim lista
Dim rut As Integer
Dim i As Integer
Dim largo As Integer

largo = Worksheets("Lista").Cells(rows.Count, 1).End(xlUp).Row

id = Worksheets("Lista").Cells(2, 1).Value
lista = Split(id, "-")
rut = lista(0)
enlace = "http://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=" & rut & "&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw+cAAhAABP4MAAz&control=svs&pestania=1"

Set objIE = CreateObject("InternetExplorer.application")
objIE.Visible = False
objIE.Navigate (enlace)
Do
    If objIE.ReadyState = 4 Then
        objIE.Visible = False
        Exit Do
    Else
        DoEvents
        End If
Loop

Dim button_name As String
button_name = "Aportantes"

Set link = objIE.document.getElementsByTagName("A")
For Each Hyperlink In link
If InStr(Hyperlink.innerText, button_name) > 0 Then
    Hyperlink.Click
Exit For
End If
Next

Dim nuevoLink As String
nuevoLink = Hyperlink

objIE.Quit

Set ie = CreateObject("InternetExplorer.application")
ie.Visible = False
ie.Navigate (nuevoLink)
Do
    If ie.ReadyState = 4 Then
        ie.Visible = False
        Exit Do
    Else
        DoEvents
        End If
Loop

Dim sem As String
Dim ano As Integer
sem = "03"
ano = 2018

Dim aportantes As Object
Dim cuotas_emitidas As Object

ie.document.getElementById("semestre").Value = sem
ie.document.getElementById("aa").Value = ano
Set elems = ie.document.getElementsByTagName("input")
For Each e In elems
If (e.getAttribute("value") = "Consultar") Then
    e.Click
    ''HERE IS THE PROBLEM
    Set aportantes = ie.document.getElementsByTagName("table")(1).getElementsByTagName("tr")(0).getElementsByTagName("tr")(1)
    ThisWorkbook.Worksheets("Lista").Cells(i, 4).Value = aportantes
    Set cuotas_emitidas = ie.document.getElementsByTagName("table")(1).getElementsByTagName("tr")(1).getElementsByTagName("tr")(1).innerText
    ThisWorkbook.Worksheets("Lista").Cells(i, 5).Value = cuotas_emitidas
End If
Next e
End Sub

HTML:

<table>
 <tbody>
    <tr>
    <td class="fondoOscuro">2.01.60 TOTAL APORTANTES</td>
    <td>58</td>
  </tr>

  <tr>
    <td class="fondoOscuro">2.01.70 CUOTAS EMITIDAS</td>
    <td>20000000 </td>
  </tr>
  <tr>
    <td class="fondoOscuro">2.01.71 CUOTAS PAGADAS</td>
    <td>7691000</td>

  </tr>
  <tr>
    <td class="fondoOscuro">2.01.72 CUOTAS SUSCRITAS Y NO PAGADAS</td>
    <td>0 </td>
  </tr>
  <tr>
    <td class="fondoOscuro">2.01.73 NUMERO DE CUOTAS CON PROMESA DE SUSCRIPCION Y PAGO</td>
    <td>0  </td>
  </tr>
  <tr>
    <td class="fondoOscuro">2.01.74 NUMERO DE CONTRATOS DE PROMESAS DE SUSCRIPCION Y PAGO</td>
    <td>0</td>
  </tr>
  <tr>
    <td class="fondoOscuro">2.01.75 NUMERO DE PROMITENTES SUSCRIPTORES DE CUOTAS</td>
    <td>0 </td>
  </tr>
  <tr>
    <td class="fondoOscuro">2.01.80 VALOR LIBRO DE LA CUOTA</td>
    <td>1.0059 </td>
  </tr>
</tbody></table>

'

2 个答案:

答案 0 :(得分:1)

XHR:

您可以使用XHR进行整个操作,而无需打开浏览器即可进行抓取。将Activesheet输出更改为要将表写入到的工作表(WriteTable hTable, 1, ActiveSheet)。

请注意POST正文的参数包括:

  1. mm = 12#个月
  2. aa = 2017年
  3. rut = 9278 rut代码

代码:

Public Sub GetTable()
    Dim sResponse As String, hTable As Object, id As String, lista() As String, rut As String
    Dim strBody As String
    id = Worksheets("Lista").Cells(2, 1).Value
    lista = Split(id, "-")
    rut = lista(0)

    strBody = "mm=12&aa=2017&rut=" & rut
    With CreateObject("MSXML2.XMLHTTP")
        .Open "POST", "http://www.cmfchile.cl//institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=9278&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw%20cAAhAABP4MAAz&control=svs&pestania=27", False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send strBody
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))

    With CreateObject("htmlFile")
        .Write sResponse
        Set hTable = .getElementsByTagName("table")(1)
    End With
    Application.ScreenUpdating = False
    WriteTable hTable, 1, ActiveSheet
    Application.ScreenUpdating = True
End Sub

Public Sub WriteTable(ByVal hTable As Object, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim tSection As Object, tRow As Object, tCell As Object, tr As Object, td As Object, R As Long, C As Long, tBody As Object
    R = startRow
    With ws
        Set tBody = hTable.getElementsByTagName("tbody")
        For Each tSection In tBody               'HTMLTableSection
            Set tRow = tSection.getElementsByTagName("tr") 'HTMLTableRow
            For Each tr In tRow
                R = R + 1
                Set tCell = tr.getElementsByTagName("td")
                C = 1
                For Each td In tCell             'DispHTMLElementCollection
                    .Cells(R, C).Value = td.innerText 'HTMLTableCell
                    C = C + 1
                Next td
            Next tr
        Next tSection
    End With
End Sub

使用浏览器(也使用上面的WriteTable子控件)

Option Explicit
Public Sub GetInfo()
    Dim ie As New InternetExplorer, hTable As HTMLTable
    Application.ScreenUpdating = False
    With ie
        .Visible = True
        .navigate "http://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=9278&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw%20cAAhAABP4MAAz&control=svs&pestania=27"
        While .Busy Or .readyState < 4: DoEvents: Wend
        .document.getElementById("aa").Value = 2017
        .document.forms("consulta").submit
        Do
            DoEvents
            On Error Resume Next
            Set hTable = .document.getElementsByTagName("table")(1)
            On Error GoTo 0
        Loop While hTable Is Nothing

        WriteTable hTable, 1, ActiveSheet
    End With
    Application.ScreenUpdating = True
End Sub

输出:

output


参考:

通过VBE>工具>引用的HTML对象库


调整代码轮廓,但仍使用pestania = 27

Option Explicit
Public Sub GetInfo()
    Dim ie As New InternetExplorer, hTable As HTMLTable, lista() As String, id As String, rut As String, enlace As String
    Application.ScreenUpdating = False

    id = Worksheets("Lista").Cells(2, 1).Value
    lista = Split(id, "-")
    rut = lista(0)
    enlace = "http://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=" & rut & "&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw+cAAhAABP4MAAz&control=svs&pestania=27"

    With ie
        .Visible = True
        .navigate enlace  '"http://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=9278&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw%20cAAhAABP4MAAz&control=svs&pestania=27"
        While .Busy Or .readyState < 4: DoEvents: Wend
        .document.getElementById("aa").Value = 2017
        .document.forms("consulta").submit
        Do
            DoEvents
            On Error Resume Next
            Set hTable = .document.getElementsByTagName("table")(1)
            On Error GoTo 0
        Loop While hTable Is Nothing

        WriteTable hTable, 1, ActiveSheet
    End With
    Application.ScreenUpdating = True
End Sub

答案 1 :(得分:1)

您已经有了不错的答案。问题是QHarr决定参加比赛时,他几乎没有任何选择让其他人站出来。但是,以下脚本将为您节省一些额外时间。我使用IE获取了page source,然后应用了更快的方法来管理其余部分。我试图解析针对2016年填充的相关表格数据。可以根据需要随意更改年份。

Sub ScrapeTabularInfo()
    Dim IE As New InternetExplorer, Html As HTMLDocument
    Dim Htmldoc As New HTMLDocument, post As Object, elem As Object
    Dim trow As Object, R&, C&

    With IE
        .Visible = False
        .navigate "http://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=V&rut=9278&grupo=&tipoentidad=FINRE&vig=VI&row=AAAw%20cAAhAABP4MAAz&control=svs&pestania=27"
        While .Busy Or .readyState < 4: DoEvents: Wend
        Set Html = .document
        Html.querySelector("#aa").innerText = 2016
        Html.querySelector("input[value='Consultar']").Click
        Do: Set post = Html.getElementsByTagName("table")(1): DoEvents: Loop While post Is Nothing
    End With

    Htmldoc.body.innerHTML = Html.DocumentElement.outerHTML

    For Each elem In Htmldoc.getElementsByTagName("table")(1).Rows
        For Each trow In elem.Cells
            C = C + 1: Cells(R + 1, C) = trow.innerText
        Next trow
        C = 0: R = R + 1
    Next elem
    IE.Quit
End Sub

这里最好的方法是利用post请求,您已经有一个演示。

添加到库中的参考(考虑到您拥有IE9或更高版本才能使.querySelector()正常工作):

Microsoft Internet Controls
Microsoft HTML Object Libray