我有一个大概150个URL的列表(所有URL都到Swappa.com/xxxxxx),我想从中获取信息。我已经弄清楚了如何从每个列表中提取第一个列表,但是我希望扩展它以提取每个URL的所有列表并将它们加载到一个表中(彼此顶部)。
以下示例:
URL示例:https://swappa.com/mobile/buy/apple-iphone-6s/sprint 要么 https://swappa.com/mobile/buy/samsung-galaxy-s6/t-mobile
Public Sub ListingInfo()
Dim cell As Range
With ThisWorkbook.Worksheets("Sheet1")
For Each cell In .Range("A1", .Cells(.Rows.count, 1).End(xlUp))
Dim Document As MSHTML.HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", cell.Value, False
.send
Set Document = New MSHTML.HTMLDocument
Document.body.innerHTML = .responseText
End With
cell.Offset(0, 1).Value = Document.querySelector(".text-nowrap").innerText
cell.Offset(0, 2).Value =
Document.querySelector("condition_label").innerText
cell.Offset(0, 3).Value =
Document.querySelector("price").innerText
cell.Offset(0, 4).Value =
Document.querySelector("storage_label").innerText
cell.Offset(0, 5).Value =
Document.querySelector("color_label").innerText
Next
End With
End Sub
答案 0 :(得分:3)
该站点使用CloudFlare防御DDoS / DoS。这意味着您几乎肯定会以xmlhttp失败,因为重定向会很快发生,并且在URL循环期间您将无法获得预期的内容。
您还需要处理未找到的页面,如果发生这种情况,CloudFlare重定向延迟。
以下内容满足了您的需求,尽管您可能希望在某些测试中添加A列中实际上有url。我假设URL在sheet1的A列中,并且信息从B列开始写出。我使用数组来不仅可以加快处理速度,而且还可以处理错误并提供命令来满足以下事实:并非您想要的所有信息都可能出现在每个页面/每个列表中。
Option Explicit
Public Sub GetResults()
Dim html As HTMLDocument, page As Long, ws As Worksheet, index As Long
Dim results(), URLs(), ie As InternetExplorer, t As Date
Const MAX_WAIT_SEC As Long = 15
Application.ScreenUpdating = False
Set ie = New InternetExplorer
Set html = New HTMLDocument
Set ws = ThisWorkbook.Worksheets("Sheet1")
URLs = Application.Transpose(ws.Range("A1:A2").Value)
ReDim results(1 To UBound(URLs))
With ie
.Visible = True
For page = LBound(URLs) To UBound(URLs)
If InStr(URLs(page), "http") > 0 Then
.Navigate2 URLs(page)
While .Busy Or .readyState < 4: DoEvents: Wend
t = Timer
Do
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While .document.querySelectorAll("#section_main").Length = 0
If Not InStr(.document.body.innerHTML, "404 - Sorry, we couldn't find what you were looking for. ") > 0 And _
Not InStr(.document.body.innerHTML, "No listings currently for sale") > 0 Then
index = index + 1
results(index) = GetInfo(.document, URLs(page))
Else
ReDim Preserve results(1 To UBound(results) - 1)
End If
End If
Next
.Quit
End With
Dim i As Long, j As Long, rowCounter As Long, arr()
rowCounter = 1
Dim headers()
headers = Array("URL", "Seller", "Feedback", "Condition", "Color", "Storage", "Price", "Headline")
ws.Cells(1, 2).Resize(1, UBound(headers) + 1) = headers
For i = LBound(results) To UBound(results)
arr = results(i)
For j = LBound(arr) To UBound(arr)
rowCounter = rowCounter + 1
ws.Cells(rowCounter, 2).Resize(1, UBound(arr(j)) + 1) = arr(j)
Next
Next
Application.ScreenUpdating = True
End Sub
Public Function GetInfo(ByVal html As HTMLDocument, ByVal url As String) As Variant
Dim dict As Object, results(), nodeList, numSellers As Long, counter As Long
Dim listings As Object, listing As Object, ws As Worksheet
Set dict = CreateObject("Scripting.Dictionary")
dict.Add "URL", url
dict.Add "Seller", vbNullString
dict.Add "Feedback", vbNullString
dict.Add "Condition", vbNullString
dict.Add "Color", vbNullString
dict.Add "Storage", vbNullString
dict.Add "Price", vbNullString
dict.Add "Headline", vbNullString
Set listings = html.getElementById("section_main").getElementsByClassName("listing_row listing_None listing_None")
ReDim results(1 To listings.Length)
For Each listing In listings
counter = counter + 1
On Error Resume Next
dict("Seller") = listing.querySelector(".text-nowrap").innerText
dict("Feedback") = listing.querySelector("[data-value]").getAttribute("data-value")
dict("Condition") = listing.querySelector(".condition_label").innerText
dict("Color") = listing.querySelector(".color_label").innerText
dict("Storage") = listing.querySelector(".storage_label").innerText
dict("Price") = listing.querySelector(".price").innerText
dict("Headline") = listing.querySelector(".headline.hidden-xs.text-nowrap").innerText
On Error GoTo 0
results(counter) = dict.Items
Set dict = ClearDict(dict)
Next
GetInfo = results
End Function
Public Function ClearDict(ByRef dict As Object) As Object
Dim key As Variant
For Each key In dict
If key <> "URL" Then dict(key) = vbNullString
Next
Set ClearDict = dict
End Function
参考:
答案 1 :(得分:1)
以下脚本应该会从第一个网址获取您要获取的内容。
Public Sub GetListingInfo()
Const Url$ = "https://swappa.com/mobile/buy/apple-iphone-6s/sprint"
Dim HTTP As New XMLHTTP60, HTML As New HTMLDocument
Dim post As HTMLDivElement, I&
With HTTP
.Open "GET", Url, False
.send
HTML.body.innerHTML = .responseText
End With
For Each post In HTML.getElementsByClassName("listing_row")
I = I + 1: Cells(I, 1) = post.querySelector(".text-nowrap span").innerText
Cells(I, 2) = post.querySelector(".condition_label").innerText
Cells(I, 3) = post.querySelector(".price").innerText
Cells(I, 4) = post.querySelector(".storage_label").innerText
Cells(I, 5) = post.querySelector(".color_label").innerText
Next post
End Sub
添加参考:
Microsoft xml, v6.0
Microsoft HTML Object Library