为什么我的VBA代码不能从网站的HTMLDoc中提取信息?

时间:2018-05-02 22:25:09

标签: html vba dom web-scraping htmldoc

我不确定为什么我的代码无法正常工作(从网站的HTMLDoc返回商家名称,电话号码和联系电话号码我试图从中提取信息。你能帮我找出我做错了什么吗(最有可能的是IHTMLElement和IHTMLElementCollection数据类型,和/或通过getElementsByTagName,getElementsByClassName等访问HTML。谢谢!!

Option Explicit

Sub FinalMantaSub()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument

IE.Visible = False
IE.navigate "https://www.manta.com/search?search_source=business&search=general+hospitals&search_location=Dallas+TX&pt=32.7825%2C-96.8207"

Do While IE.READYSTATE <> READYSTATE_COMPLETE
DoEvents
Loop

Set HTMLDoc = IE.document

Range("A3").Value = "Name"
Range("B3").Value = "Address"
Range("C3").Value = "Phone"

'variables to output on excel sheet
Dim BusinessNameFinal As String
Dim BusinessAddressFinal As String
Dim BusinessPhoneFinal As String

'variables used to create final BusinessAddress variable
Dim streetAddress As IHTMLElement
Dim addressLocality As IHTMLElement
Dim addressRegion As IHTMLElement
Dim postalCode As IHTMLElement

Dim itemprop As String
Dim itemprop2 As String

Dim BusinessNameCollection As IHTMLElementCollection
Dim BusinessName As IHTMLElement
Dim BusinessAddressCollection As IHTMLElementCollection
Dim BusinessAddress As IHTMLElement
Dim BusinessPhoneCollection As IHTMLElementCollection
Dim BusinessPhone As IHTMLElement

Dim RowNumber As Long

'get ready for business name looping
RowNumber = 4
Set BusinessName = HTMLDoc.getElementsByClassName("media-heading text-primary h4")(0).getElementsByTagName("strong").innerText
Set BusinessNameCollection = BusinessName.all

    'loop for business names
    For Each BusinessName In BusinessNameCollection
        Cells(RowNumber, 1).Value = BusinessName
        RowNumber = RowNumber + 1
    Next BusinessName

'get ready for business address looping
RowNumber = 4
itemprop = HTMLDoc.getElementsByClassName("mvm mhn").getElementsByTagName("span").getAttribute("itemprop")
    If itemprop = "streetAddress" Then
        Set streetAddress = HTMLDoc.getElementsByClassName("mvm mhn").getElementsByTagName("span").innerText
    ElseIf itemprop = "addressLocality" Then
        Set addressLocality = HTMLDoc.getElementsByTagName("span").innerText
    ElseIf itemprop = "addressRegion" Then
        Set addressRegion = HTMLDoc.getElementsByTagName("span").innerText
    ElseIf itemprop = "postalCode" Then
        Set postalCode = HTMLDoc.getElementsByTagName("span").innerText
    End If
Set BusinessAddress = streetAddress & addressLocality & addressRegion & postalCode
Set BusinessAddressCollection = BusinessAddress.all

    'loop for business addresses
    For Each BusinessAddress In BusinessAddressCollection
        BusinessAddress = streetAddress & vbNewLine & addressLocality & ", " & addressRegion & " " & postalCode
        Cells(RowNumber, 2).Value = BusinessAddress
        RowNumber = RowNumber + 1
    Next BusinessAddress

'get ready for business phone looping
RowNumber = 4
itemprop2 = HTMLDoc.getElementsByClassName("hidden-device-xs")(0).getAttribute("itemprop")
    If itemprop2 = "telephone" Then
        BusinessPhone = HTMLDoc.getElementsByClassName("hidden-device-xs")(0).getElementsByTagName("strong").innerText
    End If
Set BusinessPhone = HTMLDoc.getElementsByClassName("hidden-device-xs")(0).getElementsByTagName("strong").innerText
Set BusinessPhoneCollection = BusinessPhone.all

    'loop for business phones
    For Each BusinessPhone In BusinessPhoneCollection
        Cells(RowNumber, 3).Value = BusinessPhone
        RowNumber = RowNumber + 1
    Next BusinessPhone

Range("A1").Activate
Set HTMLDoc = Nothing

 'do some final formatting
 Range("A3").CurrentRegion.WrapText = False
 Range("A3").CurrentRegion.EntireColumn.AutoFit
 Range("A1:C1").EntireColumn.HorizontalAlignment = xlCenter
 Range("A1:D1").Merge
 Range("A1").Value = "Manta.com Business Contacts"
 Range("A1").Font.Bold = True
 Application.StatusBar = ""
 MsgBox "Done!"

 End Sub

1 个答案:

答案 0 :(得分:0)

这会提取信息。您没有在代码中循环所有结果页面或提及它,所以我已经设置它以向您展示如何执行结果的第一页。让我知道这是怎么回事。

<强>代码:

Option Explicit

Public Sub FinalMantaSub()     '<== Can't have ad blocker enabled for this site

    Dim IE As New SHDocVw.InternetExplorer
    Dim HTMLDoc As MSHTML.HTMLDocument

    IE.Visible = True
    IE.navigate "https://www.manta.com/search?search_source=business&search=general+hospitals&search_location=Dallas+TX&pt=32.7825%2C-96.8207"

    Do While IE.readyState <> READYSTATE_COMPLETE
        DoEvents
    Loop

    Set HTMLDoc = IE.document

    Dim c As Object, i As Long

    Set c = HTMLDoc.querySelectorAll("div.media-body")

    Do While Not c(i) Is Nothing
        Debug.Print "Result #" & i + 1
        Debug.Print vbNewLine
        Debug.Print "Name: " & c(i).querySelector("[itemprop=""name""]").innerText
        Debug.Print "Address: " & c(i).querySelector("[itemprop=""address""]").innerText
        Debug.Print "Phone: " & c(i).querySelector("[itemprop=""telephone""]").innerText
        Debug.Print String$(20, Chr$(61))
        i = i + 1
    Loop
    IE.Quit
End Sub

输出快照:

Snapshot

更新:

有大量结果,但您可以按如下方式进行外循环。然后,您可以将上面的内容转换为被调用的子。

    Dim arr() As String, pageNo As Long
    arr = Split(HTMLDoc.querySelector(".pagination.pagination-md.mll a").href, "&pt")
    pageNo = 1

    Do While Err.Number = 0

        On Error GoTo Errhand:

        Dim url As String
        url = Split(arr(0), "&")(0) & "&pg=" & pageNo & "&pt" & arr(1)
        Debug.Print url
        IE.navigate url
        Do While IE.readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop
        pageNo = pageNo + 1
    Loop

Errhand:
    Debug.Print "Stopped after " & pageNo & " pages."