我已经从Web上从Web抓取了数据,但是我的程序只在一个页面上工作,现在,我已经创建了一个外循环,这样程序就可以从链接的所有页面获取相似的数据集,从而给出错误,即结束有没有。由于我是vba的新手,因此需要任何专家帮助来解决此问题。 这是我的代码:
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim col As Variant, icol As New Collection
Dim csrf As Variant, i&
Dim s As String, re As Object, p As String, page As Long, rx As String 'Variable Definations
Const START_PAGE As Long = 1
Const END_PAGE As Long = 4
Const RESULTS_PER_PAGE As Long = 40
p = "\[{""@context"".*?\]"
Set re = CreateObject("VBScript.RegExp")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
For page = START_PAGE To END_PAGE
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/2620/10/1", False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
icol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, headers(), results(), ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name of VGO/NGO", "Address", "City", "State", "Tel", "Mobile", "Web", "Email")
ReDim results(1 To icol.Count, 1 To UBound(headers) + 1)
For Each col In icol
r = r + 1
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, so As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
city = json("registeration_info")(1)("nr_city")
srNo = r '<unsure where this is coming from.
state = Replace$(json("registeration_info")(1)("StateName"), "amp;", vbNullString)
tel = IIf(IsNull(json("infor")("0")("Off_phone1")), vbNullString, json("infor")("0")("Off_phone1")) '<unsure where this is coming from. Need a csrf to test with
mobile = json("infor")("0")("Mobile")
website = json("infor")("0")("ngo_url")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, city, state, tel, mobile, website, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End With
End Sub
答案 0 :(得分:2)
您需要在页面上进行外部循环,并将页码连接到url中。
r
需要在每个新页面的开始处重置为0。
每次写出当前页面的数组时,都需要找到最后使用的行,而与列无关(或者在开始时标注一个巨大的数组并填充该数组-然后在表中只写一次)。
删除自动实例。
我看到输出中看起来像重复的信息,所以值得研究有关此的数据源。
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
Dim headers(), ws As Worksheet, iCol As Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name of VGO/NGO", "Address", "City", "State", "Tel", "Mobile", "Web", "Email")
Set Http = New XMLHTTP60
Set Html = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For page = 1 To 4
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/" & CStr(page), False
.send
Html.body.innerHTML = .responseText
End With
Set iCol = New Collection
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
iCol.Add Split(Split(.item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, results()
ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
r = 0
For Each col In iCol
r = r + 1
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
city = json("registeration_info")(1)("nr_city")
srNo = r '<unsure where this is coming from.
state = Replace$(json("registeration_info")(1)("StateName"), "amp;", vbNullString)
tel = IIf(IsNull(json("infor")("0")("Off_phone1")), vbNullString, json("infor")("0")("Off_phone1")) '<unsure where this is coming from. Need a csrf to test with
mobile = json("infor")("0")("Mobile")
website = json("infor")("0")("ngo_url")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, city, state, tel, mobile, website, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
Set iCol = Nothing: Set json = Nothing
ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
Next
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function