我正在尝试从网页中分离出2个不同的innerText
字符串,但是无法将它们分开。所有标签的innerText
作为一个整体出现。
日期和季节编号是问题。
我正在使用getElementById
,这给了我一个要素。带有div
“ next_episode”的id
看起来像2个我感兴趣的内部文本条目。当我遍历其子级的内部文本时,将跳过这2个条目。我无法弄清楚如何仅将“ next_episode”标记分离出两个不同的innerText
条目。我通过在代码返回的数组中使用索引号来隔离所需的文本。
Dim IE_00 As SHDocVw.InternetExplorer
Dim HTMLDoc_00 As MSHTML.HTMLDocument
Set IE_00 = New SHDocVw.InternetExplorer
IE_00.Visible = True
IE_00.navigate "https://next-episode.net/final-space"
Do While IE_00.readyState <> READYSTATE_COMPLETE
Loop
Set HTMLDoc_00 = IE_00.document
Dim NETC_05 As MSHTML.IHTMLElementCollection
Dim NET_05 As MSHTML.IHTMLElement
'Can loop through the inner text of the children one by one and find what
I need
Set NETC_05 = HTMLDoc_00.getElementById("next_episode").Children
For Each NET_05 In NETC_05
Debug.Print NET_05.innerText
Next NET_05
'This just gives a big block of text that includes the missing inner text
I need
Set NET_05 = HTMLDoc_00.getElementById("next_episode")
Debug.Print NET_05.innerText
答案 0 :(得分:1)
数据(大部分)在NextSiblings
中:
Node.nextSibling只读属性立即返回节点 在其父节点的childNode中跟随指定的节点,否则返回 如果指定的节点是父元素中的最后一个子元素,则为null。 * 1
您可以编写一个函数,例如GetNextSiblings
,该函数检查当前节点的特定搜索字符串,然后从NextSibling
中提取所需的值。我对输出列进行了重新排序,以减少代码量,但是您可以轻松地循环另一个标头数组,并使用该顺序从dict info
访问以不同的顺序写出值。我通过字典中键的输入顺序确定输出顺序。我循环执行headers数组以填充dict键,然后使用抓取的值更新dict。
由于不需要的内容不会动态加载,因此不需要浏览器的开销。一个简单且速度更快的xhr请求就足够了。
旁注:
对于这种类型的页面,我建议使用Python 3和BeautifulSoup(bs4 4.7.1+),因为这样可以访问伪选择器:contains
。这样,代码可以更加简洁,程序速度更快。我将在最后显示。
VBA:
Option Explicit
Public Sub GetShowInfo()
Dim html As MSHTML.HTMLDocument, headers(), i As Long, aCollection As Object, info As Object
headers = Array("Name:", "Countdown:", "Date:", "Season:", "Episode:", "Status:")
Set html = New HTMLDocument
With CreateObject("Msxml2.xmlhttp")
.Open "GET", "https://next-episode.net/final-space", False
.send
html.body.innerHTML = .responseText
End With
Set info = CreateObject("Scripting.Dictionary")
For i = LBound(headers) To UBound(headers)
info(headers(i)) = vbNullString
Next
info("Name:") = html.querySelector("#next_episode .sub_main").innerText
info("Countdown:") = html.querySelector("#next_episode span").innerText
Set aCollection = html.getElementById("middle_section").getElementsByTagName("div")
Set info = GetNextSiblings(aCollection, headers, info)
Set aCollection = html.getElementById("next_episode").getElementsByTagName("div")
Set info = GetNextSiblings(aCollection, headers, info)
With ThisWorkbook.Worksheets("Sheet1")
.Cells(1, 1).Resize(1, info.Count) = info.keys
.Cells(2, 1).Resize(1, info.Count) = info.items
End With
End Sub
Public Function GetNextSiblings(ByVal aCollection As Object, ByRef headers(), ByVal info As Object) As Object
Dim item As Object, i As Long
For Each item In aCollection
For i = 2 To UBound(headers)
If InStr(item.outerHTML, headers(i)) > 0 Then
If headers(i) = "Episode:" Then
info(headers(i)) = item.NextSibling.innerText
Else
info(headers(i)) = item.NextSibling.NodeValue
End If
Exit For
End If
Next
Next
Set GetNextSiblings = info
End Function
阅读:
Python(带有bs4 4.7.1 +):
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://next-episode.net/final-space')
soup = bs(r.content, 'lxml')
current_nodes = ['Status:','Name:', 'Countdown:','Date:','Season:','Episode:']
for node in current_nodes:
selector = f'#middle_section div:contains("{node}"), #next_episode div:contains("{node}")'
if node in ['Episode:','Name:']:
print(node, soup.select_one(selector).text.replace(node,''))
elif node == 'Countdown:':
print(node, soup.select_one(selector).next_sibling.text)
else:
print(node, soup.select_one(selector).next_sibling)
答案 1 :(得分:0)
'Setting XML 05 as an Object
Dim XML_05 As New MSXML2.XMLHTTP60
'Setting HTML Document 05 as an Object
Dim HTML_05 As New MSHTML.HTMLDocument
XML_05.Open "GET", Cells(Row, NextEpisodeURL).Value, False
XML_05.send
HTML_05.body.innerHTML = XML_05.responseText
'Setting Net Element Tag Collection 05 as an Object
Dim NETC_05 As MSHTML.IHTMLElementCollection
'Setting Net Element Tag 05 as an Object
Dim NET_05 As MSHTML.IHTMLElement
'Setting Reg EX 05 as an Object
Dim REO_05 As VBScript_RegExp_55.RegExp
'Setting Match Object 05 as Object
Dim MO_05 As Object
'Setting Season array as Array
Dim SN_05() As String
'Setting Episode Name 05 as Array
Dim ENA_05() As String
'Setting Episode Number 05 as Array
Dim EN_05() As String
'Getting Episode Name Episode Number and Season Number From Net
'Set NETC_05 = HTML_05.getElementsByClassName("sub_main")
Set NET_05 = HTML_05.getElementById("previous_episode")
Set REO_05 = New VBScript_RegExp_55.RegExp
REO_05.Global = True
REO_05.IgnoreCase = True
'Getting Episode Name
REO_05.Pattern = "(Name:(.*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
ENA_05 = Split(MO_05(0), ":")
Debug.Print ENA_05(1)
Cells(Row, NextEpName).Value = ENA_05(1)
'Getting Episode Number
REO_05.Pattern = "(Episode:([0-9]*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
EN_05 = Split(MO_05(0), ":")
Debug.Print EN_05(1)
Cells(Row, EpisodeNet).Value = EN_05(1)
'Getting Season Number
REO_05.Pattern = "(Season:([0-9]*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
SN_05 = Split(MO_05(0), ":")
Debug.Print SN_05(1)
Cells(Row, SeasonNet).Value = SN_05(1)
'Getting Countdown From Net
Set NETC_05 = HTML_05.getElementById("next_episode").Children
Cells(Row, Countdown).Value = NETC_05(5).innerText
Debug.Print NETC_05(5).innerText