我正在VBA for Excel中编写一个用于检查页面的宏,我需要提取在HTML中的脚本中设置的变量。在宏中,我创建了一个IE对象:
Set objIE = CreateObject("InternetExplorer.Application")
目标网站中的页面都有以下脚本,该脚本定义了一个名为digitalData.page.pageName
的变量。这是HTML的片段:
<script>
var digitalData = '';
function init() {
digitalData = {
"user": {
"userLoginState": "guest",
"userCountry": "",
"userEmployeeName": "",
"userBirthday": "",
"userGender": "",
"userState": "",
"userID": "",
"LRUserID": "",
"userEmployeeID": "",
"userDWID": "",
"userSessionId": "BYTEzHFAdLrPoPPOlTPGWvlBjCx54jjEyB8="
},
"page": {
"pageName": "en_us:plp:men:clothing:Casual Shirts",
"pageType": "plp",
"pageGender": "men",
"pageLocale": "us",
"pageRedirected": "no",
"pageJSErrorCount": "3",
"pageLevel1": "men",
"pageLevel2": "men/clothing",
"pageLevel3": "men/clothing/Casual Shirts",
"pageLevel4": "men/clothing/Casual Shirts",
"pageHierarchy": "men/clothing/Casual Shirts"
},
如果我在Chrome中打开此网站的页面并检查它,我可以在控制台中键入变量名称,它将返回值,但我似乎无法使用VBA从IE访问该变量:
inspectLink(i, 1) = objIE.digitalData.page.pageName
在这种情况下,我希望 inspectLink(i, 1) 中得到的值是 en_us:plp:men:clothing:Casual Shirts,但实际得到的却是运行时错误:Runtime error '438' Object doesn't support the property or method。
' Excerpt from a larger macro: objIE, i and inspectLink are used below but are
' declared outside this excerpt.
'
' Five "webcat=<category>" strings (indices 0..4) -- presumably URL query
' fragments identifying the site sections to inspect; confirm against the caller.
Dim inspectCat(4) As String
inspectCat(0) = "webcat=men"
inspectCat(1) = "webcat=women"
inspectCat(2) = "webcat=kids"
inspectCat(3) = "webcat=baby"
inspectCat(4) = "webcat=home"
' Counters, both starting at zero. The only code that would increment them is
' the disabled "New Code" section below.
Dim targetSearchCount as Integer
Dim failedSearchCount as Integer
targetSearchCount=0
failedSearchCount=0
REM New Code - DOES NOT WORK cannot access pageName this way
REM if digitalData.page.pageName has en_us: in it, then it's our target
REM if it has failedSearchResult in it, then report to web dev team
REM syntax might be objIE.Document.digitalData.page.pageName
REM inspectLink(i, 1) = objIE.digitalData.page.pageName
REM MsgBox inspectLink(i, 1)
REM if inStr(objIE.digitalData.page.pageName, "en_us:") then targetSearchCount=targetSearchCount+1 endif
REM if inStr(objIE.digitalData.page.pageName, "failed_Search_Result") then failedSearchCount=failedSearchCount+1 endif
REM End New Code
REM Begin Old Code - WORKS BUT "dublicate" MAY NOT BE RELIABLE OVER TIME
REM
' Collect every element whose class is "page-Name-dublicate". The misspelling
' is part of the class name being looked up, so it must stay exactly as written.
Set pageNameDubs = objIE.Document.GetElementsByClassName("page-Name-dublicate")
'MsgBox pageNameDubs(0).Value
' Every matching element overwrites inspectLink(i, 1), so the Value of the last
' match is what remains. The "en_us:" filter is currently commented out.
For Each pageName In pageNameDubs
' If InStr(pageName.innertext, "en_us:") > 0 Then
inspectLink(i, 1) = pageName.Value
' End If
Next
REM End Old Code
答案 0 :(得分:0)
这是个有趣的问题。这个回答专门针对如何访问 digitalData.page.pageName。在下面的代码中,VVV 和 ^^^ 两行标记之间的所有内容,都是基于已加载完成的文档来执行这一操作的。您可以把这一部分集成到现有代码中。
在开发者控制台中,这就像digitalData.page.pageName
一样简单,或者等同于document.defaultView.digitalData.page.pageName
(source)。您可以在Excel VBA中获取document.defaultView
,但我无法弄清楚如何从该对象访问JavaScript全局变量。相反,我是通过DOM完成的。以下示例适用于我。
首先,确保已添加对Microsoft Internet Controls和Microsoft HTML Object Library的引用。
Option Explicit
Option Base 0
' Opens the demo page in Internet Explorer, reads the page's JavaScript global
' digitalData.page.pageName, and prints it to the Immediate window.
'
' VBA is not given direct access to the page's script globals here, so the
' value is copied into a temporary <p> element by script running inside the
' page, read back through the DOM, and the temporary element is removed.
'
' Requires references to "Microsoft Internet Controls" (SHDocVw) and
' "Microsoft HTML Object Library" (MSHTML).
'
' Raises a run-time error if the injected element cannot be found afterwards.
' The Internet Explorer window is left open and visible when the Sub ends.
Public Sub GetResult()
    Dim objIE As SHDocVw.InternetExplorer
    Set objIE = New SHDocVw.InternetExplorer

    ' Load the page with the target data
    With objIE
        .navigate "http://cxw42.github.io/49290039.html?buster=1"
        ' Cache buster thanks to https://stackoverflow.com/questions/24851824/how-long-does-it-take-for-github-page-to-show-changes-after-changing-index-html#comment69647442_24871850
        ' by https://stackoverflow.com/users/185973/joel-glovier
        .Visible = True
    End With

    ' Wait until the browser is idle AND the document is fully loaded.
    ' Polling Busy alone can fall through before the document is complete,
    ' in which case the document (and the page's script state) is not ready.
    Do While objIE.Busy Or objIE.readyState <> READYSTATE_COMPLETE
        DoEvents
    Loop

    Dim doc As MSHTML.HTMLDocument
    Set doc = objIE.document

    ' VVVVVVVVVVVVVV
    Dim win As MSHTML.HTMLWindow2
    Set win = doc.defaultView

    ' Should be able to directly access win.digitalData, but I can't get that to work.
    ' Instead, access the data indirectly.

    ' Timestamp-based id for the temporary element, so it can be found again.
    Dim uniqueid As String
    uniqueid = "id_" & Format(Now, "%yyyy%mm%dd%hh%nn%ss")

    ' Self-invoking script: create a <p>, tag it with the id, store the value
    ' in its innerText, and append it to the page body.
    Dim code As String
    code = "(function(){var x = document.createElement('p'); x.id='" & uniqueid & "'; x.innerText=digitalData.page.pageName; document.body.appendChild(x); })()"

    ' Copy digitalData.page.pageName into the DOM
    win.execScript code, "JavaScript"

    Dim pageName As String
    Dim node As Object
    Set node = doc.getElementById(uniqueid) ' Get the new DOM node

    ' Fail with a clear message instead of an opaque "object required" error
    ' when the script did not produce the element.
    If node Is Nothing Then
        Err.Raise vbObjectError + 513, "GetResult", _
                  "Temporary element '" & uniqueid & "' was not created; digitalData.page.pageName could not be read."
    End If

    pageName = node.innerText

    ' Clean up
    doc.getElementsByTagName("body").Item(0).RemoveChild node
    ' ^^^^^^^^^^^^^^

    ' Now do whatever you want with pageName.
    Debug.Print pageName
End Sub
关键在于 code 和 win.execScript。code 是一行 JavaScript,它创建一个新的 <p> 元素,并把 digitalData.page.pageName 的值复制进去。win.execScript 在页面上下文中运行这段 JavaScript,从而创建出新节点。该节点带有一个 uniqueid(好吧,大概率是唯一的),节点创建之后,我们就可以用这个 id 把它找出来,然后从该段落的 innerText 中取出结果。