我正在尝试使用xmlhttp
请求从网页中获取部分信息。当我执行脚本时,它将引发错误Object Variable Or With---
。但是,当我使用IE
尝试相同操作时,我得到的内容就像魔术。
最要注意的是,我希望抓取的内容既不是JavaScript加密的也不是动态生成的。因此,我应该使用xhr
来获得它们。我要去哪里错了?
使用IE
(有效):
Sub GetText()
Const Url As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim IE As New InternetExplorer, HTML As HTMLDocument, post As Object
With IE
.Visible = False
.navigate Url
While .Busy = True Or .readyState < 4: DoEvents: Wend
Set HTML = .document
End With
Set post = HTML.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub
使用XHR(不起作用):
Sub GetText()
Const Url As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim Http As New XMLHTTP60, HTML As New HTMLDocument, post As Object
With Http
.Open "GET", Url, False
.send
HTML.body.innerHTML = .responseText
End With
Set post = HTML.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub
上面定义的selector
完美无缺。
我本可以将相关的html elements
粘贴在这里,但是它们被包裹在comments
中。但是,我在上方提供了该站点的链接。
更清楚一点:我感兴趣的文本部分与该网页中的内容完全相同。
我的问题:如何使用XHR获得上述文本块(如上图所示)?
答案 0 :(得分:3)
The solution is plain and simple. All you need to do is kick out the comment signs
from responseText
using Replace()
function or so and then filter them using Html.body.innerHTML
to make them proper html contents
. The rest is as usual.
This is how you can get the content:
Sub GetTextFromComment()
Const URL As String = "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml"
Dim Http As New XMLHTTP60, Html As New HTMLDocument, post As Object
With Http
.Open "GET", URL, False
.send
Html.body.innerHTML = Replace(Replace(.responseText, "<!--", ""), "-->", "")
End With
Set post = Html.querySelectorAll(".section_content")(2)
MsgBox post.innerText
End Sub
答案 1 :(得分:1)
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
html.body.innerHTML = html.querySelector("#all_9711922514").LastChild.Data
Debug.Print html.querySelector("#div_9711922514").innerText
End With
End Sub
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument, ele As Object
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
For Each ele In html.querySelector("#all_9711922514").Children
If ele.NodeType = 8 Then
html.body.innerHTML = ele.Data
Debug.Print html.querySelector("#div_9711922514").innerText
Exit For
End If
Next
End With
End Sub
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.baseball-reference.com/boxes/ANA/ANA201806180.shtml", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
Dim s As String
s = .querySelector("div[id=all_1786105919]").outerHTML
s = regexRemove(s, "<([^>]+)>")
Debug.Print Replace$(Replace$(s, "&", "°"), "-->", vbNullString)
End With
End Sub
Public Function regexRemove(ByVal s As String, ByVal pattern As String) As String
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
End With
If regex.test(s) Then
regexRemove = regex.Replace(s, vbNullString)
Else
regexRemove = s
End If
End Function
输出: