无法解析网页中的所有链接

时间:2017-06-16 19:58:13

标签: vba web-scraping web-crawler

无法弄清楚如何从我的代码中使用的页面获取所有公司链接。运行我的脚本我只得到20个链接。页面有懒惰加载方法,这就是为什么我无法获得所有这些方法。对此的任何意见将受到高度赞赏。我到目前为止尝试过:

Sub Company_links()
Const lnk = "http://fortune.com"
Dim http As New XMLHTTP60, html As New HTMLDocument
Dim topic As Object

With http
    .Open "GET", "http://fortune.com/fortune500/list/", False
    .send
    html.body.innerHTML = .responseText
End With

For Each topic In html.getElementsByClassName("small-12 column row")
    x = x + 1
    With topic.getElementsByTagName("a")
        If .Length Then Cells(x, 1) = lnk & Split(.item(0).href, "about:")(1)
    End With
Next topic

Set html = Nothing: Set topics = Nothing
End Sub

3 个答案:

答案 0 :(得分:1)

在新工作簿中运行以下代码。它会将结果输出到Sheet1而不管它们是否为空,所以如果你有数据则要小心。您可以稍后根据需要更改此部分代码。

首先,您需要在VBA编辑器中从Microsoft HTML Object Library激活Microsoft Internet ControlsTools -> References。然后运行以下代码,高枕无忧,直到看到“All Done!”消息:

Sub Company_links()
    Dim i As Long
    Dim aIE As InternetExplorer
    Dim Rank As IHTMLElement, Company As IHTMLElement, Revenues As IHTMLElement
    Set aIE = New InternetExplorer
    With aIE
        .navigate "http://fortune.com/fortune500/list/"
        .Visible = True
    End With

    Do While (aIE.Busy Or aIE.ReadyState <> READYSTATE_COMPLETE)
        DoEvents
    Loop

    For i = 1 To 50

        On Error Resume Next
        Set Rank = aIE.document.getElementsByClassName("column small-2 company-rank")(999)
        If Rank Is Nothing Then
            GoTo Skip
        End If
        Exit For
Skip:
    SendKeys "{end}"
    Application.Wait (Now() + TimeValue("00:00:005"))
    Next i

    With Sheet1
        .Range("A1") = "RANK"
        .Range("B1") = "COMPANY"
        .Range("C1") = "REVENUE"

        For i = 0 To 999
            Set Rank = aIE.document.getElementsByClassName("column small-2 company-rank")(i)
            Set Company = aIE.document.getElementsByClassName("column small-5 company-title")(i)
            Set Revenues = aIE.document.getElementsByClassName("column small-5 company-revenue")(i)
            .Range("A" & i + 2) = Rank.innerText
            .Range("B" & i + 2) = Company.innerText
            .Range("C" & i + 2) = Revenues.innerText
        Next i

    End With

    SendKeys "%{F4}"
    Set aIE = Nothing
    Set Rank = Nothing
    Set Company = Nothing
    Set Revenues= Nothing
    MsgBox "All Done!"
End Sub

答案 1 :(得分:0)

如果该网站使用ajax加载其余链接。您需要先使页面加载剩余的链接。我的建议是使用selenium加载页面,然后使用你的代码来获取链接。

http://selenium-python.readthedocs.io/

答案 2 :(得分:0)

我会这样做。

Option Explicit

Sub Sample()
    Dim ie As Object
    Dim links As Variant, lnk As Variant
    Dim rowcount As Long

    Set ie = CreateObject("InternetExplorer.Application")
    ie.Visible = True
    ie.navigate "http://fortune.com"

    'Wait for site to fully load
    'ie.Navigate2 URL
    Do While ie.Busy = True
       DoEvents
    Loop

    Set links = ie.document.getElementsByTagName("a")

    rowcount = 1

    With Sheets("Sheet1")
        For Each lnk In links
        'Debug.Print lnk.innerText
            'If lnk.classname Like "*Real Statistics Examples Part 1*" Then
                .Range("A" & rowcount) = lnk.innerText
                rowcount = rowcount + 1
                'Exit For
            'End If
        Next
    End With
End Sub