交叉发布在这里:https://www.mrexcel.com/board/threads/storefront-web-scraping.1120494/#post-5403849
大家好。我在VBA中创建可以处理以下内容的网络抓取程序时遇到问题。 所以基本上我需要刮我的网上商店 https://www.ebay.com/str/customwheelandperformancedepot?_pgn=1 进入Excel。
我需要浏览所有可用页面(在底部)并打开每个列表。 现在,一旦清单打开,我们需要确定它是车轮还是车轮和轮胎包装 为此,我们可以在“项目详细信息”表中查看;如果其中任何一个项目的具体内容包含“轮胎”,“截面宽度”或“长宽比”,则其车轮和轮胎包装。
对于车轮和轮胎包装,我只需要:
1.标题[#itemTitle]3.价格[#mm-saleOrgPrc](如果无法使用[#prcIsum]
) 4.易趣商品编号[#descItemNumber]
5.描述[#ds_div]
的HTML内部
我只需要车轮清单:
1.标题[#itemTitle]
2.价格[#mm-saleOrgPrc](如果没有[#prcIsum]) 3.易趣商品编号[#descItemNumber]
4.项目详细信息表[.section> table:nth-child(2)> tbody:nth-child(1)]
6. HTML内部描述[#container]
**请注意,“项目详细信息”表可能未按顺序排列,并且可能缺少某些值(例如螺栓样式2)。标头值位于第1列和第3列(条件,后退间距,偏移量等)中,而要放入excel工作表的实际值位于第2列和第4列(New,4.5、0等)中
这是我要使用3个示例车轮的结果,然后是3个示例车轮和轮胎包装的结果。
这是我到目前为止所拥有的,因为我不知道如何访问多个页面,我试图从一个列表中仅刮取标题,而且看来我也在为此而苦苦挣扎。
Option ExplicitConst sSiteName = "https://www.ebay.com/itm/1-New-20x8-5-Kmc-District-ET-35-Bronze-5x114-3-5X4-5-Wheel-Rim/372780750649?epid=24031177590&hash=item56cb76d739:g:yDYAAOSwE91diN8Q"
Private Sub GetHTMLContents() ' Create Internet Explorer object. Dim IE As Object Set IE = CreateObject("InternetExplorer.Application") IE.Visible = False ' Keep this hidden.
IE.Navigate sSiteName ' Wait till IE is fully loaded. While IE.ReadyState <> 4 DoEvents Wend Dim oHDoc As HTMLDocument ' Create document object. Set oHDoc = IE.Document Dim oHEle As HTMLDivElement ' Create HTML element (<ul>) object. Set oHEle = oHDoc.getElementById(".vi-swc-lsp") ' Get the element reference using its ID. Dim iCnt As Integer ' Loop through elements inside the <ul> element and find <h2>, which has the texts we want. With oHEle For iCnt = 0 To .getElementsByTagName("h1").Length - 1 Debug.Print .getElementsByTagName("h1").Item(iCnt).getElementsByTagName("a").Item(0).innerHTML Next iCnt End With ' Clean up. IE.Quit Set IE = Nothing Set oHEle = Nothing Set oHDoc = Nothing
结束字幕
.getelementsbytagname行上显示“对象变量或未设置块变量”
答案 0 :(得分:1)
我知道您已经在两周前问过这个问题,但是也许您还在寻找答案。
在撰写本文时,我认为以下代码对我有用。我之所以这么说是因为我有一种印象,id
(在从服务器接收的HTML中)中的某些周期性地变化–从而破坏了代码。
这是我目前所拥有的:
代码有点混乱,随时可以重构。入口点是ScrapeAllItemsFromEbayShop
。
Option Explicit
Private Function GetUrlForShopPageN(ByVal N As Long) As String
' Should return the store URL for page N,
' where N is some 1-based page index present in the query string.
GetUrlForShopPageN = "https://www.ebay.com/str/customwheelandperformancedepot?_pgn=" & N
End Function
Private Function GetHtmlForShopPageN(ByVal webClient As WinHttp.WinHttpRequest, ByVal N As Long) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for page N,
' where N is some 1-based page index present in the query string.
Dim targetUrl As String
targetUrl = GetUrlForShopPageN(N)
With webClient
.Open "GET", targetUrl, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForShopPageN = New MSHTML.HTMLDocument
GetHtmlForShopPageN.body.innerHTML = .responseText
End With
End Function
Private Function DoesShopPageNotContainResults(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return a boolean representing whether the htmlResponse contains zero results.
DoesShopPageNotContainResults = (htmlResponse.getElementsByClassName("srp-controls").Length = 0)
End Function
Private Function GetUrlsOfItemsToScrape() As Collection
' Should return a collection of strings, representings the URLs of items.
Set GetUrlsOfItemsToScrape = New Collection
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Do While True
Dim pageIndex As Long
pageIndex = pageIndex + 1
Dim htmlResponse As MSHTML.HTMLDocument
Set htmlResponse = GetHtmlForShopPageN(webClient, pageIndex)
If DoesShopPageNotContainResults(htmlResponse) Then Exit Do
Dim anchor As MSHTML.IHTMLElement
For Each anchor In htmlResponse.getElementsByClassName("s-item__link")
Debug.Assert StrComp(LCase$(Left$(anchor.getAttribute("href"), 25)), "https://www.ebay.com/itm/", vbBinaryCompare) = 0
GetUrlsOfItemsToScrape.Add anchor.getAttribute("href")
If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do ' Delete this line completely once you think everything is working.
Next anchor
If (0 = (pageIndex Mod 10)) Then DoEvents
Loop
End Function
Private Function DoesTextContainAnyOf(ByVal textToCheck As String, stringsToCheck As Variant) As Boolean
' Should return a boolean representing whether any of "stringsToCheck"
' can be found within "textToCheck". Performs a case-sensitive search.
Dim i As Long
For i = LBound(stringsToCheck) To UBound(stringsToCheck)
If InStr(1, textToCheck, stringsToCheck(i), vbBinaryCompare) Then
DoesTextContainAnyOf = True
Exit For
End If
Next i
End Function
Private Function IsItemAWheelOnly(ByVal htmlResponse As MSHTML.HTMLDocument) As Boolean
' Should return True if, based on the HTML, the item is inferred to be a "wheel".
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlResponse.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tireAndPackageIdentifiers As Variant
tireAndPackageIdentifiers = Array("tire", "section width", "aspect ratio")
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
If DoesTextContainAnyOf(LCase$(tableRow.Cells(columnIndex).innerText), tireAndPackageIdentifiers) Then Exit Function
Next columnIndex
Next tableRow
IsItemAWheelOnly = True
End Function
Private Function GetHtmlForItem(ByVal webClient As WinHttp.WinHttpRequest, ByVal urlForItem As String) As MSHTML.HTMLDocument
' Should return a HTML document representing the response of server for a given item.
With webClient
.Open "GET", urlForItem, False
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
.send
Set GetHtmlForItem = New MSHTML.HTMLDocument
GetHtmlForItem.body.innerHTML = .responseText
End With
End Function
Private Sub ScrapeAllItemsFromEbayShop()
Dim webClient As WinHttp.WinHttpRequest
Set webClient = New WinHttp.WinHttpRequest
Dim urlsOfItemsToScrape As Collection
Set urlsOfItemsToScrape = GetUrlsOfItemsToScrape()
Dim rowWriteIndex As Long
rowWriteIndex = 1 ' Skip row 1/headers
Dim destinationSheet As Worksheet
Set destinationSheet = ThisWorkbook.Worksheets("Sheet1")
destinationSheet.Cells.ClearContents
Dim columnIndexes As Collection
Set columnIndexes = New Collection
Dim urlOfItem As Variant
For Each urlOfItem In urlsOfItemsToScrape
Debug.Print urlOfItem
Dim htmlOfItemPage As MSHTML.HTMLDocument
Set htmlOfItemPage = GetHtmlForItem(webClient, urlOfItem)
Dim nameValuePairs As Collection
If IsItemAWheelOnly(htmlOfItemPage) Then
Set nameValuePairs = CreateNameValuePairsForWheelOnly(htmlOfItemPage)
Else
Set nameValuePairs = CreateNameValuePairsForWheelAndTirePackage(htmlOfItemPage)
End If
rowWriteIndex = rowWriteIndex + 1
Dim nameValuePair As Variant
For Each nameValuePair In nameValuePairs
Dim columnWriteIndex As Long
columnWriteIndex = GetColumnIndexOfHeader(columnIndexes, nameValuePair(0))
If columnWriteIndex = 0 Then
columnWriteIndex = columnIndexes.Count + 1
columnIndexes.Add columnWriteIndex, Key:=nameValuePair(0)
destinationSheet.Cells(1, columnWriteIndex).Value = nameValuePair(0)
End If
destinationSheet.Cells(rowWriteIndex, columnWriteIndex).Value = nameValuePair(1)
Next nameValuePair
DoEvents
Next urlOfItem
End Sub
Private Function CreateNameValuePairsForWheelAndTirePackage(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_div")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Set CreateNameValuePairsForWheelAndTirePackage = outputCollection
End Function
Private Function CreateNameValuePairsForWheelOnly(ByVal htmlOfItemPage As MSHTML.HTMLDocument) As Collection
' Should return a collection of 2-element arrays (where each 2-element array
' represents a name-value pair).
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim targetElement As MSHTML.IHTMLElement
Set targetElement = htmlOfItemPage.getElementById("itemTitle")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Title", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("mm-saleOrgPrc")
If targetElement Is Nothing Then
Set targetElement = htmlOfItemPage.getElementById("prcIsum")
Debug.Assert Not (targetElement Is Nothing)
End If
outputCollection.Add CreateNameValuePair("Price", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("descItemNumber")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("eBay Item Number", targetElement.innerText)
Set targetElement = htmlOfItemPage.getElementById("desc_wrapper_ctr")
Debug.Assert Not (targetElement Is Nothing)
outputCollection.Add CreateNameValuePair("Description HTML", targetElement.innerHTML)
Dim itemSpecifics As MSHTML.IHTMLTableSection
Set itemSpecifics = htmlOfItemPage.querySelector(".itemAttr tbody")
Debug.Assert Not (itemSpecifics Is Nothing)
Dim tableRow As MSHTML.IHTMLTableRow
For Each tableRow In itemSpecifics.Rows
Debug.Assert 0 = (tableRow.Cells.Length Mod 2)
Dim columnIndex As Long
For columnIndex = 0 To (tableRow.Cells.Length - 1) Step 2
Debug.Assert InStr(1, tableRow.Cells(columnIndex).className, "attrLabels", vbBinaryCompare)
outputCollection.Add CreateNameValuePair(tableRow.Cells(columnIndex).innerText, tableRow.Cells(columnIndex + 1).innerText)
Next columnIndex
Next tableRow
Set CreateNameValuePairsForWheelOnly = outputCollection
End Function
Private Function CreateNameValuePair(ByVal someName As String, ByVal someValue As String) As String()
Dim outputArray(0 To 1) As String
outputArray(0) = someName
outputArray(1) = someValue
CreateNameValuePair = outputArray
End Function
Private Function GetColumnIndexOfHeader(ByVal columnIndexes As Collection, ByVal header As String) As Long
' Should return a 1-based column index associated with "header".
' If "header" does not exist within collection, 0 is returned.
On Error Resume Next
GetColumnIndexOfHeader = columnIndexes(header)
On Error GoTo 0
End Function
此代码运行缓慢的原因有很多:
Application.Calculation
或Application.ScreenUpdating
。一旦您认为代码可以正常工作,就需要在If GetUrlsOfItemsToScrape.Count > 10 Then Exit Do
函数中摆脱这一行GetUrlsOfItemsToScrape
。否则,您将不会刮掉所有物品。
我将DoEvents
留在了Do
循环中,以保持响应速度(也许会牺牲一些性能)