VBA-JSON从一个表中的所有页面导入数据

时间:2019-06-14 19:59:57

标签: json excel vba web-scraping

我已经从Web上从Web抓取了数据,但是我的程序只在一个页面上工作,现在,我已经创建了一个外循环,这样程序就可以从链接的所有页面获取相似的数据集,从而给出错误,即结束有没有。由于我是vba的新手,因此需要任何专家帮助来解决此问题。 这是我的代码:

Option Explicit

Public Sub FetchTabularInfo()
    Dim Http As New XMLHTTP60, Html As New HTMLDocument
    Dim col As Variant, icol As New Collection
    Dim csrf As Variant, i&
    Dim s As String, re As Object, p As String, page As Long, rx As String 'Variable Definations
    Const START_PAGE As Long = 1
    Const END_PAGE As Long = 4
    Const RESULTS_PER_PAGE As Long = 40

p = "\[{""@context"".*?\]"
    Set re = CreateObject("VBScript.RegExp")
    Application.ScreenUpdating = False
    With CreateObject("MSXML2.XMLHTTP")
        For page = START_PAGE To END_PAGE

    With Http
        .Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/2620/10/1", False
        .send
        Html.body.innerHTML = .responseText
    End With

    With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
        For i = 0 To .Length - 1
            icol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
        Next i
    End With

    Dim r As Long, headers(), results(), ws As Worksheet

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("SrNo", "Name of VGO/NGO", "Address", "City", "State", "Tel", "Mobile", "Web", "Email")
    ReDim results(1 To icol.Count, 1 To UBound(headers) + 1)

    For Each col In icol
        r = r + 1
        With Http
            .Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
            .send
            csrf = .responseText
        End With

        csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)

        Dim json As Object
        With Http
            .Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
            .setRequestHeader "X-Requested-With", "XMLHttpRequest"
            .setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
            .send "id=" & col & "&csrf_test_name=" & csrf
            Set json = JsonConverter.ParseJson(.responseText)

            Dim orgName As String, address As String, so As Long, city As String
            Dim state As String, tel As String, mobile As String, website As String, email As String

            On Error Resume Next
            orgName = json("registeration_info")(1)("nr_orgName")
            address = json("registeration_info")(1)("nr_add")
            city = json("registeration_info")(1)("nr_city")
            srNo = r '<unsure where this is coming from.
            state = Replace$(json("registeration_info")(1)("StateName"), "amp;", vbNullString)
            tel = IIf(IsNull(json("infor")("0")("Off_phone1")), vbNullString, json("infor")("0")("Off_phone1")) '<unsure where this is coming from. Need a csrf to test with
            mobile = json("infor")("0")("Mobile")
            website = json("infor")("0")("ngo_url")
            email = json("infor")("0")("Email")
            On Error GoTo 0

            Dim arr()
            arr = Array(srNo, orgName, address, city, state, tel, mobile, website, email)
            For i = LBound(headers) To UBound(headers)
               results(r, i + 1) = arr(i)
            Next
        End With
    Next col
    With ws
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With

End With

End Sub

1 个答案:

答案 0 :(得分:2)

您需要在页面上进行外部循环,并将页码连接到url中。

r需要在每个新页面的开始处重置为0。

每次写出当前页面的数组时,都需要找到最后使用的行,而与列无关(或者在开始时标注一个巨大的数组并填充该数组-然后在表中只写一次)。

删除自动实例。

我看到输出中看起来像重复的信息,所以值得研究有关此的数据源。

Option Explicit

Public Sub FetchTabularInfo()
    Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
    Dim headers(), ws As Worksheet, iCol As Collection

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("SrNo", "Name of VGO/NGO", "Address", "City", "State", "Tel", "Mobile", "Web", "Email")
    Set Http = New XMLHTTP60
    Set Html = New HTMLDocument

    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    For page = 1 To 4

        With Http
            .Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/" & CStr(page), False
            .send
            Html.body.innerHTML = .responseText
        End With
        Set iCol = New Collection
        With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
            For i = 0 To .Length - 1
                iCol.Add Split(Split(.item(i).getAttribute("onclick"), "(""")(1), """)")(0)
            Next i
        End With

        Dim r As Long, results()
        ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
        r = 0
        For Each col In iCol
            r = r + 1
            With Http
                .Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
                .send
                csrf = .responseText
            End With

            csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)

            Dim json As Object
            With Http
                .Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
                .setRequestHeader "X-Requested-With", "XMLHttpRequest"
                .setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
                .send "id=" & col & "&csrf_test_name=" & csrf
                Set json = JsonConverter.ParseJson(.responseText)

                Dim orgName As String, address As String, srNo As Long, city As String
                Dim state As String, tel As String, mobile As String, website As String, email As String

                On Error Resume Next
                orgName = json("registeration_info")(1)("nr_orgName")
                address = json("registeration_info")(1)("nr_add")
                city = json("registeration_info")(1)("nr_city")
                srNo = r                         '<unsure where this is coming from.
                state = Replace$(json("registeration_info")(1)("StateName"), "amp;", vbNullString)
                tel = IIf(IsNull(json("infor")("0")("Off_phone1")), vbNullString, json("infor")("0")("Off_phone1")) '<unsure where this is coming from. Need a csrf to test with
                mobile = json("infor")("0")("Mobile")
                website = json("infor")("0")("ngo_url")
                email = json("infor")("0")("Email")
                On Error GoTo 0

                Dim arr()
                arr = Array(srNo, orgName, address, city, state, tel, mobile, website, email)
                For i = LBound(headers) To UBound(headers)
                    results(r, i + 1) = arr(i)
                Next
            End With
        Next col
        Set iCol = Nothing: Set json = Nothing
        ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    Next
End Sub

Public Function GetLastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    GetLastRow = sh.Cells.Find(What:="*", _
                               After:=sh.Range("A1"), _
                               Lookat:=xlPart, _
                               LookIn:=xlFormulas, _
                               SearchOrder:=xlByRows, _
                               SearchDirection:=xlPrevious, _
                               MatchCase:=False).Row
    On Error GoTo 0
End Function