使用JSON进行网页抓取

时间:2019-06-20 11:01:37

标签: json excel vba web-scraping

我已经开发了代码,可以将名称,地址,网站,联系方式从页面抓取到excel工作表中。因此,由于我不熟悉JSON和Web抓取,因此无法解决此问题。第一页链接是   https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA;但是此代码无法正常运行。而且,我在其他网站上使用了相同的代码,然后运行良好。这是我的代码

Option Explicit

Public Sub FetchTabularInfo()
    Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
    Dim headers(), ws As Worksheet, iCol As Collection

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
    Set Http = New XMLHTTP60
    Set Html = New HTMLDocument

    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    For page = 1 To 73 'To cover all pages

        With Http
            .Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
            .send
            Html.body.innerHTML = .responseText
        End With
        Set iCol = New Collection
        With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
            For i = 0 To .Length - 1
                iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
            Next i
        End With

        Dim r As Long, results()
        ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
        r = 0
        For Each col In iCol
            r = r + 1
            With Http
                .Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
                .send
                csrf = .responseText
            End With

            csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)

            Dim json As Object
            With Http
                .Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
                .setRequestHeader "X-Requested-With", "XMLHttpRequest"
                .setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
                .send "id=" & col & "&csrf_test_name=" & csrf
                Set json = JsonConverter.ParseJson(.responseText)

                Dim orgName As String, address As String, srNo As Long, city As String
                Dim state As String, tel As String, mobile As String, website As String, email As String

                On Error Resume Next
                orgName = json("registeration_info")(1)("nr_orgName")
                address = json("registeration_info")(1)("nr_add")

                srNo = r                         '<unsure where this is coming from.

                mobile = json("infor")("0")("Mobile")

                email = json("infor")("0")("Email")
                On Error GoTo 0

                Dim arr()
                arr = Array(srNo, orgName, address, tel, email)
                For i = LBound(headers) To UBound(headers)
                    results(r, i + 1) = arr(i)
                Next
            End With
        Next col
        Set iCol = Nothing: Set json = Nothing
        ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    Next
End Sub

Public Function GetLastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    GetLastRow = sh.Cells.Find(What:="*", _
                               After:=sh.Range("A1"), _
                               Lookat:=xlPart, _
                               LookIn:=xlFormulas, _
                               SearchOrder:=xlByRows, _
                               SearchDirection:=xlPrevious, _
                               MatchCase:=False).Row
    On Error GoTo 0
End Function

0 个答案:

没有答案