按标签和类别进行数据收集

时间:2019-06-08 09:47:24

标签: html vba excel-vba web-scraping

我正在尝试从网站复制数据,我需要各种尺寸,价格,设施,特价,储备金。我在下面的代码框架,但我无法复制下面的元素,现在可以正常工作。遇到很多错误。有人可以调查一下吗?

 Sub gostoreit()

Dim ie As New InternetExplorer, ws As Worksheet
Dim element As IHTMLElement
Set ws = ThisWorkbook.Worksheets("Unit Data")
With ie
    .Visible = True
    .Navigate2 "" & 
"https://www.gostoreit.com/locations/georgia/cumming/go-store-cumming/"

    While .Busy Or .readyState < 4: DoEvents: Wend

    Dim listings As Object, listing As Object, headers(), results(), r 
As Long, c As Long, item As Object
    headers = Array("Size", "promo", "Reguler Price", "Online Price", "Listing Active", "features")
    Set listings = .document.getElementsByTagName("l-main-container")
    ReDim results(1 To listings.Length, 1 To UBound(headers) + 1)
    For Each listing In listings

        r = r + 1

        results(r, 1) = listing.getElementsByClassName("size_txt")(0).innerText 'Size
        results(r, 2) = listing.getElementsByClassName("helpDiscounts ls_discountsTitleSmall")(0).innerText 'promo(example. First Month Free)
        results(r, 3) = listing.getElementsByClassName("wasPrice")(0).innerText 'reguler price
        results(r, 4) = listing.getElementsByClassName("ls_unit_price")(0).innerText 'online price
        results(r, 5) = listing.getElementsByClassName("unitSelectButtonRES isRESBut")(0).innerText ' listing active
        results(r, 6) = listing.getElementsByClassName("tableUnitType _uSpan")(0).innerText ' features


    Next
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    .Quit

End With
End Sub

2 个答案:

答案 0 :(得分:2)

使用iframe src,然后处理我们之前讨论过的方式(作为我的偏爱),即识别行,然后将行html转储到代理HTMLDocument变量中,以更细粒度地利用querySelector。我忽略了reserve,因为它没有变化,您可以使用默认值自动填充它们。如果需要,可以轻松添加它们。

Option Explicit

'VBE > Tools > References: Microsoft Internet Controls
Public Sub GetData()
    Dim ie As Object
    Set ie = CreateObject("InternetExplorer.Application")
    With ie
        .Visible = True
        .Navigate2 "https://www.clickandstor.com/CAS_2.5.16/sorter/controller.php?fid=1162&mode=unit-table-p&target=casDiv1&width=100%25&height=100px&js=1&displayId=lsFramer_0&u=https%3A%2F%2Fwww.gostoreit.com%2Flocations%2Fgeorgia%2Fcumming%2Fgo-store-cumming%2F&&v_in=2.5.16&dn=1559990768103&1559990768"

        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim html2 As HTMLDocument, rows As Object, i As Long, results(), headers()
        headers = Array("Size", "Description", "On site price", "Web Price", "Offer")
        Set html2 = New HTMLDocument

        Do
            Set rows = .document.querySelectorAll(".unitRow") '.size_txt")
        Loop While rows.Length = 0
        ReDim results(1 To rows.Length, 1 To UBound(headers) + 1)
        On Error Resume Next
        For i = 1 To rows.Length - 1
            html2.body.innerHTML = rows.item(i).outerHTML
            results(i, 1) = html2.querySelector(".size_txt").innerText
            results(i, 2) = GetDescription(html2.querySelectorAll(".unitMoreHelpTitle, .pop_spacer_li"))
            results(i, 3) = html2.querySelector(".wasPrice").innerText
            results(i, 4) = html2.querySelector(".ls_unit_price").innerText
            results(i, 5) = html2.querySelector(".helpDiscounts").innerText
        Next
        On Error GoTo 0
        .Quit
    End With
    With ThisWorkbook.Worksheets("Sheet1")
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Public Function GetDescription(ByVal nodeList As Object)
    Dim i As Long, arr()
    ReDim arr(0 To nodeList.Length - 1)
    For i = 0 To nodeList.Length - 1
        arr(i) = nodeList.item(i).innerText
    Next
    GetDescription = Join$(arr, Chr$(32))
End Function

如果您想要使用iframe的更多详细方法。我选择导航到iframe的src,但是您可以使用.document.getElementById("lsFramer_0").contentDocument.querySelector语法进行访问

Option Explicit

'VBE > Tools > References: Microsoft Internet Controls
Public Sub GetData()
    Dim ie As Object
    Set ie = CreateObject("InternetExplorer.Application")
    With ie
        .Visible = True
        .Navigate2 "https://www.gostoreit.com/locations/georgia/cumming/go-store-cumming/"
         While .Busy Or .readyState < 4: DoEvents: Wend
        .Navigate2 .document.querySelector("#lsFramer_0").src
        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim html2 As HTMLDocument, rows As Object, i As Long, results(), headers()
        headers = Array("Size", "Description", "On site price", "Web Price", "Offer")
        Set html2 = New HTMLDocument

        Do
            Set rows = .document.querySelectorAll(".unitRow") '.size_txt")
        Loop While rows.Length = 0
        ReDim results(1 To rows.Length, 1 To UBound(headers) + 1)
        On Error Resume Next
        For i = 1 To rows.Length - 1
            html2.body.innerHTML = rows.item(i).outerHTML
            results(i, 1) = html2.querySelector(".size_txt").innerText
            results(i, 2) = GetDescription(html2.querySelectorAll(".unitMoreHelpTitle, .pop_spacer_li"))
            results(i, 3) = html2.querySelector(".wasPrice").innerText
            results(i, 4) = html2.querySelector(".ls_unit_price").innerText
            results(i, 5) = html2.querySelector(".helpDiscounts").innerText
        Next
        On Error GoTo 0
        .Quit
    End With
    With ThisWorkbook.Worksheets("Sheet1")
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Public Function GetDescription(ByVal nodeList As Object)
    Dim i As Long, arr()
    ReDim arr(0 To nodeList.Length - 1)
    For i = 0 To nodeList.Length - 1
        arr(i) = nodeList.item(i).innerText
    Next
    GetDescription = Join$(arr, Chr$(32))
End Function

答案 1 :(得分:1)

嗨,我格式化的代码对我来说运行良好,直到“ ReDim结果”行

问题似乎是网页上没有“ l-main-container”元素(请参见下面的图片)

Not main-container

Sub gostoreit()

Dim ie As New InternetExplorer, ws As Worksheet
Dim element As IHTMLElement
Set ws = ThisWorkbook.Worksheets("Unit Data")
With ie
.Visible = True
.Navigate2 "" & "https://www.gostoreit.com/locations/georgia/cumming/go-store-cumming/"
While .Busy Or .readyState < 4: DoEvents: Wend

Dim listings As Object, listing As Object, headers(), results(), r As Long, c As Long, item As Object
headers = Array("Size", "promo", "Reguler Price", "Online Price", "Listing Active", "features")
Set listings = .document.getElementsByTagName("l-main-container")
ReDim results(1 To listings.Length, 1 To UBound(headers) + 1)

For Each listing In listings
  r = r + 1
  results(r, 1) = listing.getElementsByClassName("size_txt")(0).innerText 'Size
  results(r, 2) = listing.getElementsByClassName("helpDiscounts ls_discountsTitleSmall") 
 (0).innerText 'promo(example. First Month Free)
  results(r, 3) = listing.getElementsByClassName("wasPrice")(0).innerText 'reguler price
  results(r, 4) = listing.getElementsByClassName("ls_unit_price")(0).innerText 'online 
  price results
  results(r, 4)(r, 5) = listing.getElementsByClassName("unitSelectButtonRES isRESBut")(0).innerText ' listing active
  results(r, 6) = listing.getElementsByClassName("tableUnitType _uSpan")(0).innerText ' features
Next

ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
.Quit
End With
End Sub