Question

我需要获取一些除了body标签之外的任何HTML元素都不包含的文本，但问题是文本被其他标签分解并需要进入单独的单元格。

例如：

<a id="00:00:00" class="ts">[00:00:00]</a> <font class="mn">Name1</font> First bit of text<br/>
<a id="00:00:09" class="ts">[00:00:09]</a> <font class="mn">Name2</font> Second Line of Text<br/>
<a id="00:01:17" class="ts">[00:01:17]</a> <font class="mn">Name3</font> A third line of text<br/>
<a id="00:01:59" class="ts">[00:01:59]</a> <font class="mn">Name4</font> The final line of text<br/>

我能够将时间戳和名称都添加到各自的列中，但我无法弄清楚如何将每行文本放入相应的行中。

到目前为止，这是我的代码：

Dim i As Integer
Dim Timestamp As Object
Dim Name As Object

my_url = "path_to_url.html"
Set html_doc = CreateObject("htmlfile")
Set xml_obj = CreateObject("MSXML2.XMLHTTP")

xml_obj.Open "GET", my_url, False
xml_obj.send
html_doc.body.innerHTML = xml_obj.responseText
Set xml_obj = Nothing

Set Timestamp = html_doc.body.getElementsByTagName("a")
Set Name = html_doc.body.getElementsByTagName("font")

i = 2
For Each itm In Timestamp
    If itm.getAttribute("className") = "ts" Then
        Cells(i, 1).Value = itm.innerText
        i = i + 1
    End If
Next

i = 2
For Each itm In Name
    If itm.getAttribute("className") = "mn" Then
        Cells(i, 2).Value = itm.innerText
        i = i + 1
    End If
Next

我在考虑以某种方式使用<br/>并使用LEFT，但我不确定这是否是最佳方法。提前谢谢。

Answer 1

只要这是响应中唯一的内容而且没有其他部分你可以做这样的事情：

编辑：修订为除了

以外的其他内容

Sub Tester()

    Const RW_START As Long = 5
    Const SPLITTER = "{xxxx}"
    Dim i As Integer, html_doc, itm
    Dim Timestamp As Object
    Dim Name As Object
    Dim arr, sep, txt

    Set html_doc = CreateObject("htmlfile")
    html_doc.body.innerHTML = Range("A1").Value 'for my testing...


    Set Timestamp = html_doc.body.getElementsByTagName("a")
    Set Name = html_doc.body.getElementsByTagName("font")

    i = RW_START
    For Each itm In Timestamp
        If itm.getAttribute("className") = "ts" Then
            Cells(i, 1).Value = itm.innerText
            itm.innerText = "" '<<<
            i = i + 1
        End If
    Next

    i = RW_START
    For Each itm In Name
        If itm.getAttribute("className") = "mn" Then
            Cells(i, 2).Value = itm.innerText
            itm.innerText = IIf(i = RW_START, "", SPLITTER) '<<<
            i = i + 1
        End If
    Next

    'get the remaining text and split on newline (<br>)
    arr = Split(html_doc.body.innerText, SPLITTER)
    i = RW_START
    For Each itm In arr
        itm = Trim(itm) 
        'remove trailing vbLf
        If Right(itm, 1) = vblf Then itm = Left(itm, Len(itm)-1)
        Cells(i, 3).Value = Trim(itm)
        i = i + 1
    Next

End Sub

使用VBA解析不在标记中的HTML文本

1 个答案: