无法使用 VBA 排除 <li> 标签内的其他标签

时间:2018-12-28 21:54:14

标签: excel vba excel-vba web-scraping

我有如下页面:

https://www.skroutz.gr/s/2195774/Western-Digital-Blue-3-5-1TB-7200rpm.html

我想用 VBA 从该页面提取价格、库存状态、卖家名称等数据。当我尝试以下代码时:

/* Page-wide background photo for the document body. */
body {
  /* Remove the spacing around the page content. */
  padding: 0;
  margin: 0;

  /* One centered copy of the image, scaled to cover the area,
     and held in place while the page content scrolls. */
  background-image: url("img/Mountain.jpg");
  background-repeat: no-repeat;
  background-position: center;
  background-size: cover;
  background-attachment: fixed;
}

我得到的是

整个“li”标签的全部数据。如何排除所有其他数据,只保留我想要的内容?

1 个答案:

答案 0 :(得分:1)

下面的代码通过循环滚动页面来加载完整的商品列表,然后使用不同的 CSS 选择器定位所需的具体信息。

Option Explicit
' GetInfo
'
' Opens the product page in Internet Explorer, scrolls it one viewport at a
' time until every product card exposes a final-price element (or until
' MAX_WAIT_SEC seconds have elapsed), then writes one row per offer to
' ThisWorkbook.Worksheets("Sheet1"):
'   column A - title attribute of the shop link
'   column B - final price text
'   column C - availability text
'
' Requires references to "Microsoft Internet Controls" (InternetExplorer).
' The browser instance is always closed, even when a runtime error occurs;
' the original error is then re-raised to the caller.
Public Sub GetInfo()
    Const MAX_WAIT_SEC As Long = 20
    Const SECONDS_PER_DAY As Single = 86400
    Const PAGE_URL As String = "https://www.skroutz.gr/s/2195774/Western-Digital-Blue-3-5-1TB-7200rpm.html"

    Dim ie As InternetExplorer
    Dim products As Object, finalPrices As Object
    Dim sellers As Object, availability As Object
    Dim startTime As Single, elapsed As Single
    Dim i As Long, rowCount As Long
    Dim errNumber As Long, errSource As String, errDescription As String

    ' Explicit instantiation instead of "Dim ... As New" so the object's
    ' lifetime is under our control.
    Set ie = New InternetExplorer
    On Error GoTo Fail

    With ie
        .Visible = True
        .Navigate2 PAGE_URL

        ' Wait for the initial page load (readyState 4 = complete).
        Do While .Busy Or .readyState < 4
            DoEvents
        Loop

        Set products = .document.querySelectorAll(".card.js-product-card")

        ' Timer returns seconds since midnight as a Single.
        startTime = Timer
        Do
            DoEvents
            .document.parentWindow.execScript "window.scrollBy(0, window.innerHeight);", "javascript"

            ' Give the page a second to react to the scroll before re-querying,
            ' so the loop condition is evaluated against fresh content.
            Application.Wait Now + TimeSerial(0, 0, 1)
            Set finalPrices = .document.querySelectorAll(".card.js-product-card span.final-price")

            elapsed = Timer - startTime
            ' Timer wraps to 0 at midnight; compensate so the timeout still fires.
            If elapsed < 0 Then elapsed = elapsed + SECONDS_PER_DAY
            If elapsed > MAX_WAIT_SEC Then Exit Do
        Loop Until finalPrices.Length = products.Length

        Set sellers = .document.querySelectorAll(".card.js-product-card .shop.cf a[title]")
        Set availability = .document.querySelectorAll(".card.js-product-card span.availability")

        ' The three lists are indexed in parallel. After a timeout they may
        ' differ in length, so only write rows that exist in all of them.
        rowCount = sellers.Length
        If finalPrices.Length < rowCount Then rowCount = finalPrices.Length
        If availability.Length < rowCount Then rowCount = availability.Length

        With ThisWorkbook.Worksheets("Sheet1")
            For i = 0 To rowCount - 1
                ' Read the title attribute explicitly (the selector guarantees
                ' it is present) rather than relying on the element's implicit
                ' default member.
                .Cells(i + 1, 1).Value = sellers.item(i).getAttribute("title")
                .Cells(i + 1, 2).Value = finalPrices.item(i).innerText
                .Cells(i + 1, 3).Value = availability.item(i).innerText
            Next i
        End With
    End With

CleanUp:
    ' Best-effort shutdown of the browser; ignore failures here so they do
    ' not mask the original error.
    On Error Resume Next
    If Not ie Is Nothing Then ie.Quit
    Set ie = Nothing
    On Error GoTo 0

    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Sub

Fail:
    ' Capture the error details before Resume clears the Err object.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanUp
End Sub