VBA Web抓取 - 在缺少某些数据时将数据分配给正确的行

时间:2016-09-08 23:07:33

标签: vba web-scraping

下面的宏应该填充网站上的搜索参数并从结果中抓取数据 - 131页,通常每页5行数据。为了测试我注释掉了循环,所以它只做了1页。

以下数据列填写在Excel工作表中:公司名称,联系人姓名,联系电子邮件,位置,电话(BH),移动,服务。

我只有Phone(BH)和Mobile才有问题。原因是公司有时只有电话(BH)或手机,而不是两者。

我循环的方式(5个公司名称,然后5个联系人姓名等)要求检测所有数据,否则我不知道该数据应该在哪一行。

Option Explicit

Private Sub CommandButton1_Click()

Dim htmlele As IHTMLElement
Dim Link As String
Dim ie As Object
Dim page As Integer
Dim companyresult As Long, nameresult As Long, emailresult As Long, locationresult As Long, servicesresult As Long

Application.ScreenUpdating = False

Link = "http://bdav.org.au/find"

Set ie = CreateObject("InternetExplorer.Application")

ie.Navigate Link
'ie.Visible = True

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "postcode" Then htmlele.Value = "3000"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "dist" Then htmlele.Value = "99"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Type = "submit" Then htmlele.Click
Next htmlele

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

'Do

    page = page + 1
    Link = "http://bdav.org.au/find/" & page

    Do Until ie.ReadyState = 4 And ie.Busy = False
        DoEvents
    Loop

    With ThisWorkbook.Worksheets("results")
        For Each htmlele In ie.Document.getElementsByTagName("h2")
            companyresult = companyresult + 1
            .Range("a" & companyresult + 1).Value = htmlele.innerHTML
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                nameresult = nameresult + 1
                .Range("b" & nameresult + 1).Value = htmlele.innerHTML
            End If
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                emailresult = emailresult + 1
                .Range("c" & emailresult + 1).Value = Mid(htmlele.href, 8)
            End If

        Next htmlele


        For Each htmlele In ie.Document.getElementsByTagName("label")

            If htmlele.innerHTML = "Location:" Then
                locationresult = locationresult + 1
                .Range("d" & locationresult + 1).Value = htmlele.NextSibling.innerHTML
            End If

        Next htmlele

           For Each htmlele In ie.Document.getElementsByTagName("label")

            If htmlele.innerHTML = "Services:" Then
                servicesresult = servicesresult + 1
                If Right(htmlele.NextSibling.innerHTML, 4) = "<br>" Then
                    htmlele.NextSibling.innerHTML = Left(htmlele.NextSibling.innerHTML, Len(htmlele.NextSibling.innerHTML) - 4)
                End If
                .Range("g" & servicesresult + 1).Value = Replace(htmlele.NextSibling.innerHTML, "<br>", vbLf)
            End If

        Next htmlele


    End With

'Loop

ie.Quit

Set ie = Nothing

Application.ScreenUpdating = True

End Sub

2 个答案:

答案 0 :(得分:0)

考虑继续并解析 Phone Mobile 数据(假设它们存在于页面上),然后运行条件错误句柄以说明潜在的不存在。具体来说,对于未设置的对象变量,错误句柄将使用Resume Next(在问题行后继续)Error 91

至于对齐行,尝试使用不同的迭代器,phoneresult = phoneresult + 1如果随机丢失则不可靠。因此,使用servicesresult或任何始终存在的节点都是电话移动的兄弟节点。以下不是确切的代码,但包含在内以证明概念。

Private Sub CommandButton1_Click()
On Error Goto ErrHandle
        '...    
        For Each htmlele In ie.Document.getElementsByTagName("Phone")
            If htmlele.innerHTML = "Phone (BH):" Then
                '...other needed code
                .Range("h" & servicesresult + 1).Value = htmlele.innerHTML
            End If    
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("Mobile")
            If htmlele.innerHTML = "Mobile:" Then
                '...other needed code
                .Range("i" & servicesresult + 1).Value = htmlele.innerHTML
            End If    
        Next htmlele
        '...
        Exit Sub

ErrHandle:
    ' MISSING NODE ERROR
    If Err.Number = 91 Then
        Resume Next
    ' ALL OTHER ERRORS
    Else:
        MsgBox Err.Number & " - " & Err.Description
        Exit Sub
    End If
End Sub

答案 1 :(得分:0)

我找到了一种比我之前尝试过的方法更清晰的解决方案,只有1个循环通过标签标签元素集合。我根据位置标签外观取一个行号(这个标签总是出现在公司)。如果缺少公司的任何其他标签,则不会将其放在此行中。

Option Explicit

Private Sub CommandButton1_Click()

Dim htmlele As IHTMLElement
Dim Link As String
Dim ie As Object
Dim page As Integer
Dim companyresult As Long, nameresult As Long, emailresult As Long, labelresult As Long

Application.ScreenUpdating = False

Link = "http://bdav.org.au/find"

Set ie = CreateObject("InternetExplorer.Application")

ie.Navigate Link
'ie.Visible = True

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "postcode" Then htmlele.Value = "3000"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "dist" Then htmlele.Value = "99"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Type = "submit" Then htmlele.Click
Next htmlele

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

Do

    page = page + 1
    Link = "http://bdav.org.au/find/" & page

    ie.Navigate Link

    Do Until ie.ReadyState = 4 And ie.Busy = False
        DoEvents
    Loop

    With ThisWorkbook.Worksheets("results")
        For Each htmlele In ie.Document.getElementsByTagName("h2")
            companyresult = companyresult + 1
            .Range("a" & companyresult + 1).Value = htmlele.innerHTML
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                nameresult = nameresult + 1
                .Range("b" & nameresult + 1).Value = htmlele.innerHTML
            End If
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                emailresult = emailresult + 1
                .Range("c" & emailresult + 1).Value = Mid(htmlele.href, 8)
            End If

        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("label")

            If htmlele.innerHTML = "Location:" Then
                labelresult = labelresult + 1
                .Range("d" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
            ElseIf htmlele.innerHTML = "Phone (BH):" Then
                .Range("e" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
            ElseIf htmlele.innerHTML = "Mobile:" Then
                .Range("f" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
            ElseIf htmlele.innerHTML = "Services:" Then
                If Right(htmlele.NextSibling.innerHTML, 4) = "<br>" Then
                    htmlele.NextSibling.innerHTML = Left(htmlele.NextSibling.innerHTML, Len(htmlele.NextSibling.innerHTML) - 4)
                End If
                .Range("g" & labelresult + 1).Value = Replace(htmlele.NextSibling.innerHTML, "<br>", vbLf)
            End If

        Next htmlele

    End With

Loop

ie.Quit

Set ie = Nothing

Application.ScreenUpdating = True

End Sub