从网页上获取数据,将其解析为特定片段,然后显示出来

时间:2019-06-30 07:38:19

标签: excel vba web-scraping

我已经开发了用于删除网页“ yelp”的代码。该代码在某些页面上运行良好,但现在可以投放广告,重复一些条目,而且不能获取页面范围内定义的所有结果。 这是我的代码:

Sub GetInfo()
    Const URL$ = "https://www.yelp.com/search?find_desc=Plumbing&find_loc=Washington%2C%20DC"
    Const base$ = "https://www.yelp.com"
    Dim Http As New XMLHTTP60, Html As New HTMLDocument
    Dim oTitle$, oPhone As Object, Htmldoc As New HTMLDocument
    Dim R&, newUrl$, I&, oWeb As Object, page&, oAddress As Object

    [A1:D1] = [{"Name","Phone","Address","Website"}]

    For page = 0 To 1   'this is where you change the last number for this script to traverse
        With Http
            .Open "GET", URL & page * 30, False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .send
            Html.body.innerHTML = .responseText
        End With

        With Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
            For I = 0 To .Length - 1
                If Not InStr(.Item(I).getAttribute("href"), "/adredir?") > 0 Then
                    oTitle = .Item(I).innerText
                    newUrl = Replace(.Item(I).getAttribute("href"), "about:", base)
                    With Http
                        .Open "GET", newUrl, False
                        .setRequestHeader "User-Agent", "Mozilla/5.0"
                        .send
                        Htmldoc.body.innerHTML = .responseText
                    End With

                    R = R + 1: Cells(R + 1, 1) = oTitle

                    Set oPhone = Htmldoc.querySelector(".biz-phone")
                    If Not oPhone Is Nothing Then
                        Cells(R + 1, 2) = oPhone.innerText
                    End If

                    Set oAddress = Htmldoc.querySelector(".map-box-address")
                    If Not oAddress Is Nothing Then
                        Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
                    End If

                    Set oWeb = Htmldoc.querySelector(".biz-website > a")
                    If Not oWeb Is Nothing Then
                        Cells(R + 1, 4) = oWeb.innerText
                    End If
                End If
            Next I
        End With
    Next page
End Sub

0 个答案:

没有答案