我正在尝试从网页上抓取数据。它不适用于所有格式都与类相同的网站,并标记所有内容。我收到一个错误消息“下标超出范围”,并且在“ ReDim results(1 To rowCount,1 To numColumns)”代码上突出显示了。
我在页面上得到了答案:Web Scraping by TagName 该代码适用于https://www.neighborhoodselfstorage.net/self-storage-ocean-city-md-88769
现在我正尝试将相同的代码用于:https://www.stormore.net/self-storage-seattle-wa-101616#utm_source=GoogleLocal&utm_medium=WRLocal&utm_campaign=101616
请任何人帮助解决此问题。
Case.find({},{case_id:1}).then(recs=>{
recs.forEach(function(rec){
console.log("x");
CustomerAgreement.findOne({case_id:rec.case_id}).then(t=>{
console.log("y");
});
});
console.log("z");
});
答案 0 :(得分:1)
我认为您想要类似以下的内容。
初始错误:
我认为您的最初错误部分是由于该URL未返回您在浏览器中使用相同URL时看到的HTML所致。我看到的内容在响应中不包含这些列表,因此行数为0;因此,您在此行上的错误subscript out of range
错误:ReDim results(1 To rowCount, 1 To numColumns)
因此,URL更改为:
https://www.stormore.net/self-storage-seattle-wa-101616
下一步:
检查html,以了解如何生成列表行,我们注意到列表用.main li.pure-g
干净地表示。需要向li添加一个附加类,以过滤掉不需要的信息。我们只想循环包含感兴趣信息的行。
Set listings = html.querySelectorAll(".main li.pure-g")
最后:
在检查html时,我们注意到并非所有行都有所有感兴趣的项目,例如offer1
和offer2
,因此我们将尝试访问On Error Resume Next
,On Error GoTo 0
中某些项目的尝试包裹起来,以掩盖错误并在输出的该列中输出“”。 / p>
VBA:
Option Explicit
Public Sub GetInfo()
Dim ws As Worksheet, html As HTMLDocument, s As String
Const URL As String = "https://www.stormore.net/self-storage-seattle-wa-101616"
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
html.body.innerHTML = s
Dim headers(), results(), listings As Object, amenities As String
headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
Set listings = html.querySelectorAll(".main li.pure-g")
Dim rowCount As Long, numColumns As Long, r As Long, c As Long
Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long
rowCount = listings.Length
numColumns = UBound(headers) + 1
ReDim results(1 To rowCount, 1 To numColumns)
Dim html2 As HTMLDocument
Set html2 = New HTMLDocument
For item = 0 To listings.Length - 1
r = r + 1
html2.body.innerHTML = listings.item(item).innerHTML
'size,description, amenities,specials offer1 offer2, rate type, price
results(r, 1) = Trim$(html2.querySelector(".size").innerText)
results(r, 2) = Trim$(html2.querySelector(".description").innerText)
On Error Resume Next
Set icons = html2.querySelectorAll("i[title]")
ReDim amenitiesInfo(0 To icons.Length - 1)
For icon = 0 To icons.Length - 1
amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
Next
amenities = Join$(amenitiesInfo, ", ")
results(r, 3) = amenities
results(r, 4) = html2.querySelector(".offer1").innerText
results(r, 5) = html2.querySelector(".offer2").innerText
On Error GoTo 0
results(r, 6) = html2.querySelector(".rate-label").innerText
results(r, 7) = html2.querySelector(".price").innerText
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub