我需要从this site获取价格表。
为此,我已经开发了一些代码:
Sub TableExample()
Dim IE As Object
Dim doc As Object
Dim strURL As String
strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"
' replace with URL of your choice
Set IE = CreateObject("InternetExplorer.Application")
With IE
'.Visible = True
.navigate strURL
Do Until .readyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.document
GetAllTables doc
.Quit
End With
End Sub
Sub GetAllTables(doc As Object)
' get all the tables from a webpage document, doc, and put them in a new worksheet
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Set ws = Sheets("Sheet1")
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
'rng.Offset(, -1) = "Table " & tabno
If tabno = 5 Then
For Each rw In tbl.Rows
colno = 6
For Each cl In rw.Cells
If colno = 5 And nextrow < 1 Then
Set classColl = doc.getElementsByClassName("shop")
Set imgTgt = classColl(nextrow - 2).getElementsByTagName("img").getElementsByClassName("btn-goto-shop")
rng.Value = imgTgt(0).getAttribute("alt")
Else
rng.Value = cl.innerText
End If
Set rng = rng.Offset(, 1)
I = I + 1
colno = colno + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -I)
I = 0
Next rw
End If
Next tbl
ws.Cells.ClearFormats
End Sub
通过此代码,我可以获得所需的结果,但不会获取具有商店名称的最后一列。任何人都可以帮我这个吗?
答案 0 :(得分:1)
如果检查页面的HTML,则可以看到className为productOffers-listItemOfferPrice
的元素具有必需的信息。有比您可能意识到的更多的信息。参见底部的代码输出。
在主子GetTable
中,我使用XHR请求获取页面HTML并将其存储在HTML文档中。
执行.getElementsByClassName("productOffers-listItemOfferPrice")
获取所有商品信息时,您需要解析每个元素.outerHTML
。
辅助函数GetTransactionInfo
使用split函数仅获取.outerHTML
的产品信息部分。返回的字符串类似于以下示例示例:
" 			"product_id": 			"143513", 			"product_name": ..."
辅助函数TidyString
采用inputString和regex模式,通过匹配不需要的字符串并将其替换为空文字字符串(vbNullString
),应用regex模式匹配来整理产品信息字符串。>
正则表达式模式1:
例如,第一个正则表达式模式"&#\d+;"
除去了字符串中带有数字的所有&#:
正则表达式模式2:
第二个正则表达式模式Chr$(34) & headers(currentItem) & Chr$(34) & ":"
从字符串中删除产品标头信息,即仅获取值。
例如它需要"product_id": "143513"
并返回"143513"
。
示例页面信息(示例)
示例代码输出:
VBA代码:
Option Explicit
'Tools > References > HTML Object Library
Public Sub GetTable()
Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.idealo.de/preisvergleich/OffersOfProduct/143513.html", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
End With
Dim currentItem As Long
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For currentItem = 0 To listItems.Length - 1
Dim tempString As String, columnValues() As String
tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
columnValues = GetColumnValues(tempString, headers)
.Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
Next currentItem
End With
Application.ScreenUpdating = True
End Sub
Public Function GetTransactionInfo(ByVal inputString) As String
'Split to get just the transaction items i.e. Headers and associated values
GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function
Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
'Extract transaction info
'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
'Example inputString
Dim regex As Object, tempString As String
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.Pattern = matchPattern
End With
If regex.TEST(inputString) Then
TidyString = regex.Replace(inputString, vbNullString)
Else
TidyString = inputString
End If
End Function
Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
' Extract just the inner string value of each header e.g. 143513
Dim arr() As String, currentItem As Long, tempString As String
tempString = inputString
For currentItem = LBound(headers) To UBound(headers)
tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
Next currentItem
arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
GetColumnValues = arr
End Function
答案 1 :(得分:-1)
这是我正在运行的修改后的代码
Sub GetAllTables(doc As Object)
' get all the tables from a webpage document, doc, and put them in a new worksheet
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Set ws = Sheets("Sheet1")
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ActiveSheet.Range("B" & nextrow)
'rng.Offset(, -1) = "Table " & tabno
If tabno = 5 Then
For Each rw In tbl.Rows
colno = 1
For Each cl In rw.Cells
If colno = 5 Then
rng.Value = doc.getElementsByClassName("shop")(nextrow - 6).getElementsByTagName("img")(1).getAttribute("alt")
Else
rng.Value = cl.innerText
End If
Set rng = rng.Offset(, 1)
I = I + 1
colno = colno + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -I)
I = 0
Next rw
End If
Next tbl
ws.Cells.ClearFormats
End Sub