我正在尝试使用此代码抓取网站以提取姓名和联系人...
String.Join(" , ", i)
Dim xx '这里有错误 设置xx = htmlDoc.getElementsByClassName(“ ldb-contact-summary”)
Sub Test()
Dim htmlDoc As Object
Dim htmlDoc2 As Object
Dim elem As Variant
Dim tag As Variant
Dim dns As String
Dim pageSource As String
Dim pageSource2 As String
Dim url As String
Dim row As Long
row = 2
dns = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", dns, True
.send
While .readyState <> 4: DoEvents: Wend
If .statusText <> "OK" Then
MsgBox "ERROR" & .Status & " - " & .statusText, vbExclamation
Exit Sub
End If
pageSource = .responseText
End With
Set htmlDoc = CreateObject("htmlfile")
htmlDoc.body.innerHTML = pageSource
尝试使用此行时
Set htmlDoc = Nothing
Set htmlDoc2 = Nothing
End Sub
我收到错误消息“对象不支持该属性或方法”(438) 因为我不太擅长抓取问题,您能帮我吗?
答案 0 :(得分:3)
要获取姓名及其对应的电话号码,可以尝试以下代码段:
Sub GetProfileInfo()
Const URL$ = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page="
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim post As HTMLDivElement, R&, P&
For p = 1 To 3 'put here the highest number you wanna traverse
With Http
.Open "GET", URL & p, False
.send
Html.body.innerHTML = .responseText
End With
For Each post In Html.getElementsByClassName("ldb-contact-summary")
With post.querySelectorAll(".ldb-contact-name a")
If .Length Then R = R + 1: Cells(R, 1) = .item(0).innerText
End With
With post.getElementsByClassName("ldb-phone-number")
If .Length Then Cells(R, 2) = .item(0).innerText
End With
Next post
Next p
End Sub
添加到库中以执行上述脚本的参考:
Microsoft xml, v6.0
Microsoft Html Object Library
答案 1 :(得分:1)
正如您在上面的注释中提到的所有页面一样,我将使用一个类来保存XMLHTTP对象,并为它提供提取数据的方法,同时并入一个方法来查找结果页面的数量并将其循环。经过测试,我得到了251行结果。
注意:通过调试发现,保留SetRequestHeader会对您造成人工验证请求。删除它意味着XMLHTTP方法有效。无论有没有,对我来说都是如此。
clsHTTP类
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal url As String) As String
Dim sResponse As String
With http
.Open "GET", url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
GetString = sResponse
End With
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
Set names = html.querySelectorAll("[class*='ldb-contact-name']")
Set telNums = html.querySelectorAll(".ldb-phone-number")
ReDim namesArray(0 To names.Length - 1)
ReDim telsArray(0 To telNums.Length - 1)
For i = 0 To names.Length - 1
namesArray(i) = names.item(i).innerText
telsArray(i) = telNums.item(i).innerText
Next
GetInfo = Array(namesArray, telsArray)
End Function
标准模块1
Option Explicit
Public Sub GetReviewData()
Dim sResponse As String, html As HTMLDocument, http As clsHTTP
Dim numPages As Long, pageNum As Long, url As String
Dim results As Collection, item As Variant, ws As Worksheet
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/"
Set http = New clsHTTP
Set html = New HTMLDocument
Set results = New Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
With html
.body.innerHTML = http.GetString(url)
numPages = .querySelectorAll("[data-idx]").item(html.querySelectorAll("[data-idx]").Length - 2).innerText
results.Add http.GetInfo(html)
If numPages > 1 Then
For pageNum = 2 To numPages
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
.body.innerHTML = http.GetString(url)
results.Add http.GetInfo(html)
Next
End If
Dim numResults As Long
If results.Count > 0 Then
Application.ScreenUpdating = False
For Each item In results
numResults = UBound(item(0)) + 1
With ws
.Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
.Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
End With
Next
Application.ScreenUpdating = True
End If
End With
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
硒:
Option Explicit
Public Sub GetReviewData()
Dim html As HTMLDocument
Dim numPages As Long, pageNum As Long, url As String
Dim results As Collection, item As Variant, ws As Worksheet
Dim d As WebDriver, elements As WebElements
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=1&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
Set html = New HTMLDocument
Set results = New Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set d = New ChromeDriver
With d
.Start "Chrome"
.get url
Set elements = .FindElementsByCss("[data-idx]")
numPages = elements(elements.Count - 1).Text
html.body.innerHTML = .PageSource
results.Add GetInfo(html)
If numPages > 1 Then
For pageNum = 2 To numPages
Application.Wait Now + TimeSerial(0, 0, 2)
url = "https://www.zillow.com/detroit-mi/real-estate-agent-reviews/?page=" & pageNum & "&showAdvancedItems=false®ionID=17762&locationText=Detroit%20MI"
.get url
html.body.innerHTML = .PageSource
results.Add GetInfo(html)
Next
End If
Dim numResults As Long
If results.Count > 0 Then
Application.ScreenUpdating = False
For Each item In results
numResults = UBound(item(0)) + 1
With ws
.Cells(GetLastRow(ws, 1) + 1, 1).Resize(numResults, 1) = Application.Transpose(item(0))
.Cells(GetLastRow(ws, 2) + 1, 2).Resize(numResults, 1) = Application.Transpose(item(1))
End With
Next
Application.ScreenUpdating = True
End If
End With
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
End With
End Function
Public Function GetInfo(ByVal html As HTMLDocument) As Variant
Dim names As Object, telNums As Object, i As Long, namesArray(), telsArray()
Set names = html.querySelectorAll("[class*='ldb-contact-name']")
Set telNums = html.querySelectorAll(".ldb-phone-number")
ReDim namesArray(0 To names.Length - 1)
ReDim telsArray(0 To telNums.Length - 1)
For i = 0 To names.Length - 1
namesArray(i) = names.item(i).innerText
telsArray(i) = telNums.item(i).innerText
Next
GetInfo = Array(namesArray, telsArray)
End Function