从网站上的表格中抓取数据,而无需搜索标签

时间:2018-10-05 22:55:05

标签: web-scraping access-vba

这是该问题的延续 using InStr to search for quotes, spaces, colons, etc

我也在尝试在下面获取所有这些数据。我本来打算搜索<td align="left">的循环 但我有一种感觉,它将带来大量垃圾以及所需的结果。我想知道是否还有更好的方法。

<b>Total Hospital Beds</b></td> 
                                        <td align="left">Adult ICU (intensive care unit) CCU (critical care unit)</td>
                                        <td align="left">26</td>
                                        <td align="left">Medical/surgical</td>
                                        <td align="left">198</td>
                                        <td align="left">Pediatric</td>
                                        <td align="left">20</td>

                                        <td align="center" colspan="2"><b>Services</b></td>
                                        <td align="left">Acute Hemodialoysis Service</td>
                                        <td align="left">Chronic Hemodialysis Stations</td>
                                        <td align="left">Magnetic Resonance Imaging - On Site</td>
                                        <td align="left">Mixed OR's</td>
                                        <td align="left">7</td>

基本上,所有内容都在底部的部分中,并附带了屏幕截图 enter image description here

这就是我所拥有的,但是现在它在第一页之后停止,加粗的行突出显示了

Public Sub VisitPages()
    Dim IE As New InternetExplorer
    With IE
        .Visible = True
        .navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"

        While .Busy Or .ReadyState < 4: DoEvents: Wend

        With .Document
            .querySelector("#middleContent_cbType_1").Click
            .querySelector("#middleContent_cbType_4").Click
            .querySelector("#middleContent_btnGetList").Click
        End With

        While .Busy Or .ReadyState < 4: DoEvents: Wend

        Dim list As Object, i  As Long
        Set list = .Document.querySelectorAll("#main_table [href*=doPostBack]")

        For i = 0 To list.Length - 1

            list.Item(i).Click

            While .Busy Or .ReadyState < 4: DoEvents: Wend

            ' Application.Wait Now + TimeSerial(0, 0, 3) '<== Delete me later. This is just to demo page changes
            **Debug.Print .Document.getElementById("middleContent_lbName_county").outerHTML**
            'do stuff with new page

            Dim FirstOcc As Long
            Dim TtlHosp As Variant
            Dim FLine As Variant
            Dim FLineFixed As Variant

            TtlHosp = Mid(.Document.Body.innerHTML, InStr(.Document.Body.innerHTML, "Total Hospital Beds"), 4000)

            Do Until InStr(TtlHosp, "<td align=" & Chr(34) & "left" & Chr(34) & ">") = 0

                FirstOcc = InStr(TtlHosp, "<td align=" & Chr(34) & "left" & Chr(34) & ">")
                FLine = Mid(TtlHosp, FirstOcc + 17, 150)
                FLineFixed = Mid(FLine, 1, InStr(FLine, "</td>") - 1)
                Debug.Print FLineFixed
                TtlHosp = Mid(TtlHosp, FirstOcc + 17, 2000)

            Loop

            .Navigate2 .Document.URL             '<== back to homepage
            While .Busy Or .ReadyState < 4: DoEvents: Wend
            Set list = .Document.querySelectorAll("#main_table [href*=doPostBack]") 'reset list (often required in these scenarios)
        Next
        Stop                                     '<== Delete me later
        '.Quit '<== Remember to quit application
    End With
End Sub

1 个答案:

答案 0 :(得分:2)

该信息似乎仅用于特殊医院的选择,而床位信息在第三张表中。 (注意:结果当前是用Excel编写的)

Option Explicit
Public Sub VisitPages()
    Dim IE As New InternetExplorer, ws As Worksheet
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    With IE
        .Visible = True
        .navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"

        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document
            .querySelector("#middleContent_cbType_5").Click
            .querySelector("#middleContent_btnGetList").Click
        End With

        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim list As Object, i  As Long
        Set list = .document.querySelectorAll("#main_table [href*=doPostBack]")
        For i = 0 To list.Length - 1
            list.item(i).Click

            While .Busy Or .readyState < 4: DoEvents: Wend

            WriteTable .document.getElementsByTagName("table")(3), .document.getElementById("middleContent_lbName_county").innerText, GetLastRow(ws, 3) + 1, ws
            'do stuff with new page
            .Navigate2 .document.URL             '<== back to homepage
            While .Busy Or .readyState < 4: DoEvents: Wend
            Set list = .document.querySelectorAll("#main_table [href*=doPostBack]") 'reset list (often required in these scenarios)
        Next
        .Quit                                    '<== Remember to quit application
    End With
End Sub

Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
    End With
End Function

Public Sub WriteTable(ByVal hTable As HTMLTable, facility As String, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long, titleRow As Long
    r = startRow: titleRow = startRow
    With ws
        Set tRow = hTable.getElementsByTagName("tr")
        .Cells(titleRow, 1) = facility
        For Each tr In tRow
            r = r + 1
            Set tCell = tr.getElementsByTagName("td")
            c = 2
            For Each td In tCell
                .Cells(r, c).Value = td.innerText
                c = c + 1
            Next td
        Next tr
    End With
End Sub