使用 getElementById 时如何分离多个 innerText 条目

时间:2019-08-29 18:25:28

标签: html excel vba web-scraping screen-scraping

我正在尝试从网页中分离出 2 个不同的 innerText 字符串,但无法将它们分开:所有标签的 innerText 总是作为一个整体返回。有问题的是日期和季数这两项。

我使用的是 getElementById,它返回单个元素。id 为“next_episode”的 div 中似乎包含我感兴趣的 2 个 innerText 条目,但当我遍历其子元素的 innerText 时,这 2 个条目会被跳过。我想不出如何只从“next_episode”这个标签中分离出这两个不同的 innerText 条目。目前我是通过对代码返回的数组使用索引号来取出所需文本的。

Dim IE_00 As SHDocVw.InternetExplorer
Dim HTMLDoc_00 As MSHTML.HTMLDocument
Set IE_00 = New SHDocVw.InternetExplorer
IE_00.Visible = True

IE_00.navigate "https://next-episode.net/final-space"
Do While IE_00.readyState <> READYSTATE_COMPLETE
Loop
Set HTMLDoc_00 = IE_00.document

Dim NETC_05 As MSHTML.IHTMLElementCollection
Dim NET_05 As MSHTML.IHTMLElement

'Can loop through the inner text of the children one by one and find what 
I need

Set NETC_05 = HTMLDoc_00.getElementById("next_episode").Children

For Each NET_05 In NETC_05
Debug.Print NET_05.innerText
Next NET_05

'This just gives a big block of text that includes the missing inner text 
I need

Set NET_05 = HTMLDoc_00.getElementById("next_episode")
Debug.Print NET_05.innerText

2 个答案:

答案 0 :(得分:1)

数据(大部分)在NextSiblings中:

enter image description here

  

Node.nextSibling 只读属性返回在其父节点的 childNodes 中紧跟在指定节点之后的节点;如果指定节点是父元素的最后一个子节点,则返回 null。 *1


您可以编写一个函数(例如 GetNextSiblings),检查当前节点是否包含特定的搜索字符串,然后从其 NextSibling 中提取所需的值。为了减少代码量,我对输出列重新排了序;不过您也可以轻松地循环另一个标头数组,按该数组的顺序从字典 info 中取值,从而以不同的顺序输出。输出顺序由键写入字典的先后顺序决定:我先循环 headers 数组填充字典的键,然后再用抓取到的值更新字典。

由于所需的内容不是动态加载的,因此不需要承担浏览器的开销。一个简单且速度更快的 XHR 请求就足够了。


旁注:

对于这种类型的页面,我建议使用Python 3和BeautifulSoup(bs4 4.7.1+),因为这样可以访问伪选择器:contains。这样,代码可以更加简洁,程序速度更快。我将在最后显示。


VBA:

Option Explicit
Public Sub GetShowInfo()
    Dim html As MSHTML.HTMLDocument, headers(), i As Long, aCollection As Object, info As Object

    headers = Array("Name:", "Countdown:", "Date:", "Season:", "Episode:", "Status:")
    Set html = New HTMLDocument

    With CreateObject("Msxml2.xmlhttp")
        .Open "GET", "https://next-episode.net/final-space", False
        .send
        html.body.innerHTML = .responseText
    End With

    Set info = CreateObject("Scripting.Dictionary")

    For i = LBound(headers) To UBound(headers)
        info(headers(i)) = vbNullString
    Next

    info("Name:") = html.querySelector("#next_episode .sub_main").innerText
    info("Countdown:") = html.querySelector("#next_episode span").innerText
    Set aCollection = html.getElementById("middle_section").getElementsByTagName("div")
    Set info = GetNextSiblings(aCollection, headers, info)
    Set aCollection = html.getElementById("next_episode").getElementsByTagName("div")
    Set info = GetNextSiblings(aCollection, headers, info)

    With ThisWorkbook.Worksheets("Sheet1")
        .Cells(1, 1).Resize(1, info.Count) = info.keys
        .Cells(2, 1).Resize(1, info.Count) = info.items
    End With
End Sub

Public Function GetNextSiblings(ByVal aCollection As Object, ByRef headers(), ByVal info As Object) As Object
    Dim item As Object, i As Long
    For Each item In aCollection
        For i = 2 To UBound(headers)
            If InStr(item.outerHTML, headers(i)) > 0 Then
                If headers(i) = "Episode:" Then
                    info(headers(i)) = item.NextSibling.innerText
                Else
                    info(headers(i)) = item.NextSibling.NodeValue
                End If
                Exit For
            End If
        Next
    Next
    Set GetNextSiblings = info
End Function

阅读:

  1. NextSibling
  2. CSS selectors
  3. querySelector

Python(带有bs4 4.7.1 +):

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://next-episode.net/final-space')
soup = bs(r.content, 'lxml')    
current_nodes = ['Status:','Name:', 'Countdown:','Date:','Season:','Episode:']

for node in current_nodes:
    selector = f'#middle_section div:contains("{node}"), #next_episode div:contains("{node}")'
    if node in ['Episode:','Name:']:
        print(node, soup.select_one(selector).text.replace(node,''))
    elif node == 'Countdown:':
         print(node, soup.select_one(selector).next_sibling.text)
    else:
        print(node, soup.select_one(selector).next_sibling)

答案 1 :(得分:0)

'Setting XML 05 as an Object
    Dim XML_05 As New MSXML2.XMLHTTP60
'Setting HTML Document 05 as an Object
    Dim HTML_05 As New MSHTML.HTMLDocument

    XML_05.Open "GET", Cells(Row, NextEpisodeURL).Value, False
    XML_05.send
    HTML_05.body.innerHTML = XML_05.responseText

'Setting Net Element Tag Collection 05 as an Object
    Dim NETC_05 As MSHTML.IHTMLElementCollection
'Setting Net Element Tag 05 as an Object
    Dim NET_05 As MSHTML.IHTMLElement
'Setting Reg EX 05 as an Object
    Dim REO_05 As VBScript_RegExp_55.RegExp
'Setting Match Object 05 as Object
    Dim MO_05 As Object
'Setting Season array as Array
    Dim SN_05() As String
'Setting Episode Name 05 as Array
    Dim ENA_05() As String
'Setting Episode Number 05 as Array
    Dim EN_05() As String

'Getting Episode Name Episode Number and Season Number From Net

'Set NETC_05 = HTML_05.getElementsByClassName("sub_main")
    Set NET_05 = HTML_05.getElementById("previous_episode")
    Set REO_05 = New VBScript_RegExp_55.RegExp
        REO_05.Global = True
        REO_05.IgnoreCase = True

'Getting Episode Name
    REO_05.Pattern = "(Name:(.*))"
        Set MO_05 = REO_05.Execute(NET_05.innerText)
            Debug.Print MO_05.Count
            Debug.Print MO_05(0).Value
                ENA_05 = Split(MO_05(0), ":")
            Debug.Print ENA_05(1)
            Cells(Row, NextEpName).Value = ENA_05(1)

'Getting Episode Number
    REO_05.Pattern = "(Episode:([0-9]*))"
        Set MO_05 = REO_05.Execute(NET_05.innerText)
            Debug.Print MO_05.Count
            Debug.Print MO_05(0).Value
                EN_05 = Split(MO_05(0), ":")
            Debug.Print EN_05(1)
            Cells(Row, EpisodeNet).Value = EN_05(1)

'Getting Season Number
    REO_05.Pattern = "(Season:([0-9]*))"
        Set MO_05 = REO_05.Execute(NET_05.innerText)
            Debug.Print MO_05.Count
            Debug.Print MO_05(0).Value
                SN_05 = Split(MO_05(0), ":")
            Debug.Print SN_05(1)
            Cells(Row, SeasonNet).Value = SN_05(1)

'Getting Countdown From Net
    Set NETC_05 = HTML_05.getElementById("next_episode").Children
        Cells(Row, Countdown).Value = NETC_05(5).innerText
        Debug.Print NETC_05(5).innerText