我已经在vba中编写了一个脚本,可以从网页中从咖啡店中抓取不同类别的内容。我尝试解析的类别为shopname
,address
和phone
。我已经在脚本中定义了选择器。我面临的问题是我无法将它们存储在字典中以便以后打印。
如果要处理两个项目,我可以按照已经显示的方式来处理它们。当还有其他项目时,我会感到困惑,因为在电话中(当前在下面将其注释掉)开始发挥作用。
如何在字典中存储三个项目并打印出来?
Sub GetDictItems()
Dim key As Variant, Html As New HTMLDocument, URL$, R&
Dim post As HTMLDivElement, shopName$, address$, phone$
Dim idic As Object: Set idic = CreateObject("Scripting.Dictionary")
URL = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=San%20Francisco%2C%20CA&page=2"
With New XMLHTTP60
.Open "GET", URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Html.body.innerHTML = .responseText
End With
For Each post In Html.getElementsByClassName("info")
shopName = post.querySelector(".business-name span").innerText
address = post.querySelector(".adr").innerText
' phone = post.querySelector(".phones").innerText
idic(shopName) = address
Next post
For Each key In idic.keys
R = R + 1: Cells(R, 1) = key
Cells(R, 2) = idic(key)
Next key
End Sub
为执行上述脚本而添加的参考:
Microsoft XML, v6.0
Microsoft HTML Object Library
My intention here to learn as to how I can store multiple items in a dictionary in order to print them later.
预期输出:
答案 0 :(得分:3)
另一种可能性是为数据创建简单的类。然后将此类的实例添加到字典中。另外两个类WebData
和InfoDataCollection
将有助于分离代码并提高可读性等。
GetDictItems方法
Const url = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=San%20Francisco%2C%20CA&page=2"
Sub GetDictItems()
With New WebData
.Load url
.PrintToExcel
End With
End Sub
WebData类模块
Private m_html As HTMLDocument
Private m_data As InfoDataCollection
Private Sub Class_Initialize()
Set m_html = New HTMLDocument
Set m_data = New InfoDataCollection
End Sub
Public Sub Load(url As String)
With New XMLHTTP60
.Open "GET", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
m_html.body.innerHTML = .responseText
End With
m_data.Add m_html
End Sub
Public Sub PrintToExcel()
Dim key As Variant
Dim R As Long
Dim info As InfoData
For Each key In m_data.Keys
R = R + 1
Set info = m_data.Items(key)
Cells(R, 1) = info.ShopName
Cells(R, 2) = info.Address
Cells(R, 3) = info.Phone
Next key
End Sub
InfoData类模块
Private m_shopName As String
Private m_address As String
Private m_phone As String
Public Property Get ShopName() As String
ShopName = m_shopName
End Property
Public Property Let ShopName(ByVal vNewValue As String)
m_shopName = vNewValue
End Property
Public Property Get Address() As String
Address = m_address
End Property
Public Property Let Address(ByVal vNewValue As String)
m_address = vNewValue
End Property
Public Property Get Phone() As String
Phone = m_phone
End Property
Public Property Let Phone(ByVal vNewValue As String)
m_phone = vNewValue
End Property
InfoDataCollection类模块
Private m_dictionary As Object
Private Sub Class_Initialize()
Set m_dictionary = CreateObject("Scripting.Dictionary")
End Sub
Public Sub Add(html As HTMLDocument)
Dim info As InfoData
Dim post As HTMLDivElement
m_dictionary.RemoveAll
For Each post In html.getElementsByClassName("info")
Set info = New InfoData
info.ShopName = post.querySelector(".business-name span").innerText
info.Address = post.querySelector(".adr").innerText
info.Phone = post.querySelector(".phones").innerText
Set m_dictionary(info.ShopName) = info
Next post
End Sub
Public Property Get Keys() As Variant()
Keys = m_dictionary.Keys
End Property
Public Property Get Items() As Object
Set Items = m_dictionary
End Property
答案 1 :(得分:2)
看来我可以达到如下结果。如果有更好的方法,我将给出答案:
For Each post In Html.getElementsByClassName("info")
shopName = post.querySelector(".business-name span").innerText
address = post.querySelector(".adr").innerText
phone = post.querySelector(".phones").innerText
idic(shopName & "|" & address & "|" & phone) = 1
Next post
For Each key In idic.keys
R = R + 1: Cells(R, 1) = Split(key, "|")(0)
Cells(R, 2) = Split(key, "|")(1)
Cells(R, 3) = Split(key, "|")(2)
Next key
答案 2 :(得分:2)
我喜欢已经给出的答案(+)。您还可以将数组加载到项目中。
For Each post In Html.getElementsByClassName("info")
shopName = post.querySelector(".business-name span").innerText
address = post.querySelector(".adr").innerText
phone = post.querySelector(".phones").innerText
idic(post) = Array(shopName, address, phone)
Next post
For Each key In idic.keys
R = R + 1: ActiveSheet.Cells(R, 1) = idic(key)(0)
ActiveSheet.Cells(R, 2) = idic(key)(1)
ActiveSheet.Cells(R, 3) = idic(key)(2)
Next key
您也可以只使用应该很快的数组。
Dim list As Object, arr(), post As Object, index As Long
Set list = Html.getElementsByClassName("info")
ReDim arr(1 To list.Length)
For Each post In list
index = index + 1
shopName = post.querySelector(".business-name span").innerText
address = post.querySelector(".adr").innerText
phone = post.querySelector(".phones").innerText
arr(index) = Array(shopName, address, phone)
Next
For index = LBound(arr) To UBound(arr)
ActiveSheet.Cells(index, 1).Resize(1, UBound(arr(index))) = arr(index)
Next
但是,我会尝试将html.getElementsByClassName("info")
加载到变量中,并在两种情况下都使用它。
顺便说一句,数据存在于脚本标签内的json字符串中,因此如果使用json解析器,例如jsonconverter.bas也可以:
Dim json As Object, item As Object, results(), i As Long
Set json = JsonConverter.ParseJson(Html.querySelectorAll("script[type='application/ld+json']").item(1).innerHTML)
ReDim results(1 To json.Count)
i = 1
For Each item In json
results(i) = Array(item("name"), Join$(item("address").Items, " ,"), item("telephone"))
i = i + 1
Next