VBA中的HTML抓取 - 多页列表中的所有值

时间:2018-04-13 03:48:45

标签: html vba excel-vba web-scraping excel

我正在尝试收集此网站的所有房产数据: http://taxsales.lgbs.com/

目前有7,000多个属性,但在页面的任何给定视图中,我只能看到15 - 20,具体取决于屏幕分辨率。

我已经大致了解了如何在HTML中搜索属性名称和详细信息。因为初始搜索有警告屏幕,所以我使用以下代码“点击”“同意”按钮以查看后续搜索页面。您还可以看到我不知道如何仅查找属性详细信息,而是(基本上)从整个站点获取所有HTML并稍后在excel中进行筛选。

问题:

1)有没有办法一次查看所有属性的数据? - 或者 - 如何“遍历”结果的每个部分以最终收集所有这些?

2)我如何只收集物业地址,销售日期,销售类型等数据?

<li ng-if="listing.property.sale_date" class="ng-binding ng-scope"><label>Sale Date:</label> 4/5/18 9:00 AM</li>

物业销售日期HTML:

<a ng-click="listing.addressClick()" class="ng-binding"> 02863 Stouton St, Philadelphia PA 19134-3515 </a>

属性地址HTML:

<a href="" ng-click="selectPage(page + 1, $event)" class="ng-binding">Next</a>

下一步按钮HTML:

{{1}}

1 个答案:

答案 0 :(得分:0)

这是一种使用Web请求(xhr)的hacky方法。查看页面请求,它看起来像是返回一个JSON对象。我正在解析这个JSON对象并将其转储到第一个工作表的范围内。这里有一些代码可以帮助您入门。

'You'll need the following references:
'MSXML v6.0
'Microsoft Scripting Runtime
'JSON project from: https://github.com/VBA-tools/VBA-JSON

Public Sub Scraper()
    Dim webrequest      As MSXML2.XMLHTTP60
    Dim JSON            As Object
    Dim responses       As Object
    Dim itemdict        As Variant
    Dim i               As Long
    Dim j               As Long
    Dim k               As Long
    Dim item            As Variant
    Dim myarray         As Variant: ReDim myarray(0 To 20, 0 To 5000)
    Dim url             As String: url = "http://taxsales.lgbs.com/api/property_sales/?in_bbox=-139.04111793750002%2C7.97834134877145%2C-54.40244606250002%2C61.39968867373271&offset=10&ordering=sale_date%2Caddress_full%2Cuid&sale_type=SALE%2CRESALE%2CSTRUCK+OFF%2CFUTURE+SALE"

    For i = 0 To 10 'Do a loop to get SOME of the data, probably need a different loop here
        With New MSXML2.XMLHTTP60
            .Open "GET", url
            .setRequestHeader "accept", "application/json, text/plain, */*"
            .send

            'Parse the response into a JSON dict
            Set JSON = JsonConverter.ParseJson(.responseText)
            url = JSON("next") ' the next URl to send a GET request
            Set responses = JSON("results") 'Get the results Dict

            On Error Resume Next ' getting an error, just ignoring for now

            For Each itemdict In responses
                j = 0

                'add headers
                If k = 0 Then
                    For Each item In itemdict
                        myarray(j, k) = item
                        j = j + 1
                    Next
                End If

                'add values
                For Each item In itemdict
                    myarray(j, k) = itemdict(item)
                    j = j + 1
                Next

                k = k + 1
            Next

            On Error GoTo 0

        End With
    Next

    ReDim Preserve myarray(0 To 20, 0 To k - 1)
    ThisWorkbook.Sheets(1).Range("A1:T" & k - 1).Value = TransposeArray(myarray)

End Sub

'using this function as worksheetfunction.transpose causing issues
Public Function TransposeArray(myarray As Variant) As Variant
    Dim X As Long
    Dim Y As Long
    Dim Xupper As Long
    Dim Yupper As Long
    Dim tempArray As Variant

    Xupper = UBound(myarray, 2)
    Yupper = UBound(myarray, 1)

    ReDim tempArray(Xupper, Yupper)

    For X = 0 To Xupper
        For Y = 0 To Yupper
            tempArray(X, Y) = myarray(Y, X)
        Next
    Next

    TransposeArray = tempArray
End Function

输出:

Example