我已经开发了用于从网站抓取数据的代码,但是由于我对JSON的了解很少,因此我可以根据需要获取输出,因此该代码是针对以下Web开发的:https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1将我的代码复制到其他具有json的网站(例如此网站):https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA;但是此代码无法正常运行。这是我的代码(我希望它对大多数网站都是通用的)
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
Dim headers(), ws As Worksheet, iCol As Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
Set Http = New XMLHTTP60
Set Html = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For page = 1 To 8 'To cover all pages
With Http
.Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
.send
Html.body.innerHTML = .responseText
End With
Set iCol = New Collection
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, results()
ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
r = 0
For Each col In iCol
r = r + 1
With Http
.Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
srNo = r '<unsure where this is coming from.
mobile = json("infor")("0")("Mobile")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, tel, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
Set iCol = Nothing: Set json = Nothing
ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
Next
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
也请让我知道我在做的错误,以便将来解决。
答案 0 :(得分:1)
简短答案:
不。
我想说的是,您不可能为大多数网站编写通用的 。可以说通用部分是解析器本身。但是您需要熟悉每个端点的json结构,以适当地进行直接解析。 Json本身已经定义了结构语法/组件,但是您想要从这些结构/组件中获得什么将具有不同的访问路径,并且需要不同的处理方式。然后可能需要提供参数以及输出格式的差异。
什么是最佳方案?
如果您有一组url(理想的API端点)列表,则您有更好的机会编写可能会持续一段时间的内容,因为您可以熟悉返回的json。但是这有多通用?它实际上只是分支代码。
可重复使用的内容:
可能是非解析器的东西,可以泛化为例如您创建的方法和类可以解析整个结构的路径并查找关键字并返回这些路径?您编写的帮助程序功能可能会递归地编写嵌套结构等。使请求和处理失败的代码等。...我绝对建议您在网络抓取中研究可重用代码的类。
基于类的示例:
我会在何时添加该内容