使用Excel VB进行网络爬取getElementsByTagName(“ a”)仅在某些时间有效

时间:2019-12-14 20:50:15

标签: excel vba

我已使用以下代码成功将电子邮件中的数据提取到Excel中:

Set ElementCol = html.getElementsByTagName("a")

    For Each Link In ElementCol   

        If InStr(Link, "mailto:") Then

            Cells(MyRow, MyCol).Value = Link

            Cells(MyRow, MyCol).Select

            MyCol = MyCol + 1

        End If

    Next 'email address loop

这与:

enter image description here

但是,我不能使用类似的代码来输入名称:

enter image description here

我尝试过getElementById()getElementsByClassName,并且尝试将html.更改为doc.Text.,因为我认为问题可能是电子邮件地址是超链接,所以html.是合适的,但是名称是纯文本,所以我应该使用其他名称?我收到“对象必需”错误。 编辑:感谢您的意见。下面是我的完整代码。我将在英国下议院网站上搜集MP的详细信息。该URL有一个随机数,因此我将URL强制使用,尝试从1到5000的每个数字。我突出显示了无效部分(当我尝试此处建议的新代码时,该部分仍然无效)示例URL:{ {3}}

Sub scrapeHyperlinksWebsite()

Dim ie As InternetExplorer
Dim html As HTMLDocument
Dim Link As Object
Dim ElementCol As Object
Dim erow As Long
Dim MyURL As String
Dim MyRow, MyCol As Integer
Dim Constituency As String
Dim coll As IHTMLElementCollection ' suggestion from Stackoverflow


Application.ScreenUpdating = True

'clear existing data
Call clearRows


Set ie = New InternetExplorer
ie.Visible = False

'initiate variable
URL_Number = 0

'static elements:
URL_Str1 = "https://members.parliament.uk/member/"
URL_Str2 = "/contact"

'Begin loop down page

For MyRow = 2 To 5000

    URL_Number = URL_Number + 1

    MyCol = 1

    MyURL = URL_Str1 & URL_Number & URL_Str2

    ie.navigate MyURL

    Do While ie.readyState <> READYSTATE_COMPLETE
        Application.StatusBar = "Loading website…"
        DoEvents
    Loop

    Set html = ie.document

    Range("L2") = html.DocumentElement.innerHTML


    'Get MP email addresses
    MyCol = 1

    Set ElementCol = html.getElementsByTagName("a")

    For Each Link In ElementCol   'find emails in each site

        If InStr(Link, "mailto:") Then

            Cells(MyRow, MyCol).Value = Link

            Cells(MyRow, MyCol).Select

            MyCol = MyCol + 1

        End If

    Next 'email address loop


 MyCol = 3

 'Get MP phone numbers

    Set ElementCol = html.getElementsByTagName("a")

    For Each Link In ElementCol   'find emails in each site

        If InStr(Link, "tel:") Then

            Cells(MyRow, MyCol).Value = Link

            Cells(MyRow, MyCol).Select

            MyCol = MyCol + 1

        End If

    Next 'email address loop




 'Get Constituency - This section is not working
' MyCol = 5
'
'    Set coll = html.getElementsByTagName("h1")
'
'    For Each Link In coll   'find emails in each site
'
'        If InStr(Link, "Contact") Then
'
'            Cells(MyRow, MyCol).Value = coll
'
'            Cells(MyRow, MyCol).Select
'
'            MyCol = MyCol + 1
'
'        End If
'
'    Next 'email address loop
'

 'above section is not working






Next ' main loop





Set ie = Nothing

Application.StatusBar = ""









'Tidy data

Cells.Select

    With Selection

        .VerticalAlignment = xlBottom

        .Orientation = 0

        .AddIndent = False

        .IndentLevel = 0

        .ShrinkToFit = False

        .ReadingOrder = xlContext

        .MergeCells = False

    End With

    With Selection

        .VerticalAlignment = xlCenter

        .Orientation = 0

        .AddIndent = False

        .IndentLevel = 0

        .ShrinkToFit = False

        .ReadingOrder = xlContext

        .MergeCells = False

    End With

    Selection.RowHeight = 15

    Range("D1").Select






End Sub


Sub clearRows()

Range("A2:M50000").Select
    Selection.ClearContents
    Range("A2").Select
End Sub

1 个答案:

答案 0 :(得分:0)

The code to get the H1s works fine. There is no contact there, and so you can try the below:

MyCol = 5
Set coll = html.getElementsByTagName("h1")
For Each Link In coll   'find emails in each site
    If InStr(Link, "Contact") Then
        Cells(MyRow, MyCol).Value = Link.innerText
         MyCol = MyCol + 1
    End If
Next Link    'MP Name Loop