通过遍历URL列表从网站中提取HTML代码

时间:2019-05-11 09:53:44

标签: excel vba internet-explorer web-scraping automation

我正在使用Excel VBA根据D列中每一行的URL启动IE浏览器选项卡。然后根据预定义的类提取相关的HTML代码,并将其填充在A到C列中。 / p>

很确定我错过了一步。该过程在D2处停止,并且不会继续从下一个URL(在D3,D4等单元格中)提取HTML。

提前感谢您的任何建议!

Sub useClassnames()
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
Dim IE As InternetExplorer
Dim html As HTMLDocument
Dim shellWins As New ShellWindows
Dim IE_TabURL As String
Dim intRowPosition As Integer

Set IE = New InternetExplorer
IE.Visible = False

intRowPosition = 2

Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True

IE.navigate Sheet1.Range("D" & intRowPosition)

While IE.Busy
    DoEvents
Wend

intRowPosition = intRowPosition + 1

While Sheet1.Range("D" & intRowPosition) <> vbNullString
    IE.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)

    While IE.Busy
        DoEvents
    Wend

    intRowPosition = intRowPosition + 1

Wend

Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading Web page…"
DoEvents
Loop

Set html = IE.document
Set elements = html.getElementsByClassName("container-bs")

Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "container-bs" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
count = count + 1
End If
Next element

Range("A2:C2000").Select
Columns("A:A").EntireColumn.AutoFit
Columns("B:B").ColumnWidth = 36
End Sub

1 个答案:

答案 0 :(得分:1)

您的行

Set html = IE.document
Set elements = html.getElementsByClassName("container-bs")

etc发生在While循环之后。它必须在里面。


您的If语句:

If element.className = "container-bs"

应该是多余的,因为您已经遍历了该类名的集合;所以我删除了这个。


您不是在循环中使用element,因此从本质上讲,您是在使用它来控制递增的计数器变量。这表明您可以使用更好的编码策略来检索感兴趣的项目。


始终声明父级工作表,并且不依赖隐式Activesheet引用-容易出错。


我希望结构更像如下(我无法考虑重构来删除element


Option Explicit
Public Sub UseClassnames()
    Dim element As IHTMLElement, elements As IHTMLElementCollection, ie As InternetExplorer
    Dim html As HTMLDocument, intRowPosition As Long

    intRowPosition = 2
    Set ie = CreateObject("InternetExplorer.Application")
    ie.Visible = True

    While Sheet1.Range("D" & intRowPosition) <> vbNullString

        If intRowPosition = 2 Then
            ie.navigate Sheet1.Range("D" & intRowPosition)
        Else
            ie.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)
        End If

        While ie.Busy Or ie.readyState < 4: DoEvents: Wend

        Set html = ie.document
        Set elements = html.getElementsByClassName("container-bs")

        Dim count As Long, erow As Long

        count = 0

        For Each element In elements
            erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
            With Sheet1
                .Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
                .Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
                .Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
            End With
            count = count + 1
        Next element

        intRowPosition = intRowPosition + 1
    Wend
    With Sheet1
        .Range("A2:C2000").Select
        .Columns("A:A").EntireColumn.AutoFit
        .Columns("B:B").ColumnWidth = 36
    End With
End Sub