网页搜罗Excel VBA

时间:2019-11-27 12:18:54

标签: html excel vba web-scraping

我想从电子商务网站中删除商品标题,价格,卖家和图片网址。需要将结果复制到A-D列中的活动工作表中。以下代码最初是由@QHarr之一为Amazon开发的。有任何帮助对其进行更新以获得所需结果的帮助吗?

我已将图像包含在结果中。谢谢

Public Sub WriteOutProductInfo()

'VBE>Tools>References> Microsoft HTML Object Library
Dim html As MSHTML.HTMLDocument

Set html = New MSHTML.HTMLDocument

With CreateObject("MSXML2.XMLHTTP")
    ' change the url for the page of amazon from where to copy data
    .Open "GET", "https://www.daraz.lk/catalog/?from=input&q=sarees&ppath=31186:3287", False
    .setRequestHeader "User-Agent", "Mozilla/5.0"
    .send
    html.body.innerHTML = .responseText
End With

' 1. declare additional headers as variable

Dim headers(), titles As Object, prices As Object, original_prices As Object
Dim seller As Object

headers = Array("Title", "Price")

With html
    Set titles = .querySelectorAll(".c3gUW0,.c13VH6")
    Set prices = .querySelectorAll(".------------")
End With

Dim results(), r As Long, priceInfo As String

ReDim results(1 To titles.Length, 1 To UBound(headers) + 1)

For r = 0 To titles.Length - 1
    results(r + 1, 1) = titles.Item(r).innerText        
Next

Dim ws As Worksheet

Set ws = ThisWorkbook.Worksheets("Sheet1")

With ws
    .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
    .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With

End sub

enter image description here

1 个答案:

答案 0 :(得分:0)

由于各种原因,您发布的宏有些令人毛骨悚然。使用它来达成您的目标:

Sub WriteOutProductInfo()

'This macro works on the sheet it's startetd from

Dim browser As Object
Dim url As String
Dim nodesAllOffers As Object
Dim nodeOneOffer As Object
Dim currentRow As Long

  'Row, for title and price for current offer
  currentRow = ActiveSheet.Cells(Rows.Count, 1).End(xlUp).Row + 1
  'Make sure that line 2 is the start line if nothing has been entered in the table yet
  If currentRow = 1 Then currentRow = 2

  'Your sample url
  'You can loop over various urls from a table with the following code
  '(Loop not included here)
  url = "https://www.daraz.lk/catalog/?from=input&q=sarees&ppath=31186:3287"

  'Initialize Internet Explorer, set visibility,
  'call URL and wait until page is fully loaded
  Set browser = CreateObject("internetexplorer.application")
  browser.Visible = False
  browser.navigate url
  Do Until browser.ReadyState = 4: DoEvents: Loop

  'Get all html elements with the css class "c3KeDq" in a node collection
  'These are all offers of the called page, which contain in each case
  'the connected information (title and price)
  '
  'This is a sample of the html code for one offer
  '<div class="c3KeDq">
  '  <div class="c3vCyH">
  '  </div>
  '  <div class="c16H9d">
  '    <a age="0" href="//www.daraz.lk/products/ladies-office-and-saree-wear-brown-i102604104-s1009196133.html?search=1"
  '       title="Ladies Office And Saree Wear - Brown">
  '      Ladies Office And Saree Wear - Brown
  '    </a>
  '  </div>
  '  <div class="c3gUW0">
  '    <span class="c13VH6">
  '      Rs. 1,150
  '    </span>
  '  </div>
  '  <div class="c3lr34">
  '  </div>
  '  <div class="c15YQ9">
  '    <span class="c2i43- c1enUu" title="Sri Lanka">
  '      Sri Lanka
  '    </span>
  '  </div>
  '  <div class="c2attd">
  '    <div class="c31VUX">
  '      <button age="0" type="button" class="ant-btn c1xzE_ ant-btn-primary ant-btn-lg">
  '        ADD TO CART
  '      </button>
  '    </div>
  '  </div>
  '</div>
  Set nodesAllOffers = browser.document.getElementsByClassName("c3KeDq")
  If Not nodesAllOffers Is Nothing Then
    'If we got the node collection
    'We step through all offers and pull the titel and price out
    For Each nodeOneOffer In nodesAllOffers
      'Get title
      ActiveSheet.Cells(currentRow, 1).Value = getValueByClassName(nodeOneOffer, "c16H9d")
      'Get price
      ActiveSheet.Cells(currentRow, 2).Value = getValueByClassName(nodeOneOffer, "c13VH6")
      'Next offer (row)
      currentRow = currentRow + 1
    Next nodeOneOffer
  End If

  'Clean up
  browser.Quit
  Set browser = Nothing
  Set nodesAllOffers = Nothing
  Set nodeOneOffer = Nothing
End Sub

此功能:

Private Function getValueByClassName(htmlNode As Object, cssClassName As String) As String

'This function works with a single node
'If htmlNode is a node collection, you will got only the first value

Dim nodeByClassName As Object
Dim resultString As String

  'Try to get wanted node
  Set nodeByClassName = htmlNode.getElementsByClassName(cssClassName)(0)
  If Not nodeByClassName Is Nothing Then
    'If we got the class node
    'Take innertext
    resultString = Trim(nodeByClassName.innertext)
  End If

  'Clean up
  Set nodeByClassName = Nothing

  'Return result string
  getValueByClassName = resultString
End Function