我想找到一个集合并在其上循环以加载每个页面。我正在尝试:
i
For i = 1 To "number of last page (column D)"
| https://voronezh.leroymerlin.ru/catalogue/krovelnye-mastiki/ | | -7 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/vodostok/ | | -125 | 2 |
| https://voronezh.leroymerlin.ru/catalogue/rozetki-i-vyklyuchateli/ | | -898 | 10 |
| https://voronezh.leroymerlin.ru/catalogue/ramki-dlya-rozetok-i-vyklyuchateley/ | | -398 | 5 |
| https://voronezh.leroymerlin.ru/catalogue/nakladki-dlya-rozetok-i-vyklyuchateley/ | | -35 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/podrozetniki/ | | -11 | 1 |
| https://voronezh.leroymerlin.ru/catalogue/silovye-kabeli/ | | -175 | 2 |
我尝试使用以下代码循环所有url,但无法正常工作。
Sub get_data()
Dim wsSheet As Worksheet, REZULTSheet As Worksheet, Rows As Long, http As New XMLHTTP60, html As New HTMLDocument
Dim i As Integer, topic As HTMLHtmlElement, link As Variant, x As Long, num_pages As Variant, links As Variant
Set wsSheet = Sheets("URLs_2")
Set REZULTSheet = Sheets("Products")
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).row
links = wsSheet.Range("A1:A" & Rows)
num_pages = wsSheet.Range("D1:D" & Rows)
REZULTSheet.Select
For i = 1 To ??? 'num_pages?
Application.ScreenUpdating = False
With http
For Each link In links
.Open "GET", link & "?display=90&sortby=1&page=" & i, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
Do: DoEvents: Loop Until .readyState = 4
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
With topic.getElementsByClassName("product-name")
If .Length Then x = x + 1: Cells(x, 2) = .item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then Cells(x, 3) = .item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then Cells(x, 1) = .item(1).innerText
End With
Next topic
Next link
End With
Next i
End Sub
由于大多数代码都是重复的,所以有什么方法可以运行循环以减少代码量。
答案 0 :(得分:2)
您真正想做的是一个重构(我可能会随时间添加)以减少代码复杂性。现在,要解决循环问题,请参见如何生成两个包含url和页数的一维数组,然后对页数使用1个循环,并使用当前循环计数器值索引到另一个数组中。
摆脱自动实例化,使用类名进行限定,使用Long而不是Integer,使用工作表而不是工作表。
在重构中使用数组,以便更快地写到表中。
Option Explicit
Public Sub GetData()
Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Dim i As Long, topic As MSHTML.HTMLHtmlElement
Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
Application.ScreenUpdating = False
On Error GoTo errHand
Set http = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
x = x + 1
With topic.getElementsByClassName("product-name")
If .Length Then rezultSheet.Cells(x, 2) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then rezultSheet.Cells(x, 3) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then rezultSheet.Cells(x, 1) = .Item(1).innerText
End With
Next topic
html.body.innerHTML = vbNullString
Next
End With
Next
errHand:
Application.ScreenUpdating = True
End Sub
重构(恐怕不是我的最好,而是一个起点。例如,您将需要一些较低级别的错误处理)。它是更多的代码,但是开始将单独的逻辑任务分配到它们自己的sub / func中:
Option Explicit
Public wsSheet As Worksheet, rezultSheet As Worksheet
Public Sub GetData()
Dim http As MSXML2.XMLHTTP60, rowCount As Long, pageCounts(), links()
Application.ScreenUpdating = False
On Error GoTo errHand
Set http = New MSXML2.XMLHTTP60
Set wsSheet = Sheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
With rezultSheet.Cells
.ClearContents
.ClearFormats
End With
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
GetResults http, pageCounts, links
errHand:
Application.ScreenUpdating = True
If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
End Sub
Public Sub GetResults(ByVal http As MSXML2.XMLHTTP60, ByRef pageCounts(), ByRef links())
Dim i As Long, numPages As Long
Dim page As Long, html As MSHTML.HTMLDocument
Set html = New MSHTML.HTMLDocument
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
WriteOutResults html
html.body.innerHTML = vbNullString
Next
End With
Next
End Sub
Public Sub WriteOutResults(ByVal html As MSHTML.HTMLDocument)
Dim topic As MSHTML.HTMLHtmlElement, results()
Dim r As Long, productCards As Object
Set productCards = html.getElementsByClassName("ui-product-card__info")
ReDim results(1 To productCards.Length, 1 To 3)
For Each topic In productCards
r = r + 1
With topic.getElementsByClassName("product-name")
If .Length Then results(r, 2) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then results(r, 3) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text") '
If .Length Then results(r, 1) = .Item(1).innerText
End With
Next topic
Dim lastRow As Long
lastRow = GetLastRow(rezultSheet)
rezultSheet.Cells(lastRow + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
答案 1 :(得分:2)
采用@QHar解决方案的代码,以便仅在计算机内存中工作...
Option Explicit
Private Sub GetDataInMemory()
Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Dim i As Long, topic As MSHTML.HTMLHtmlElement
Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
Dim Data() As String, k As Long
Set http = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
Set rezultSheet = ThisWorkbook.Worksheets("Products")
rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))
ReDim Data(2, k)
For i = LBound(pageCounts) To UBound(pageCounts)
numPages = pageCounts(i)
With http
For page = 1 To numPages
.Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
html.body.innerHTML = .responseText
For Each topic In html.getElementsByClassName("ui-product-card__info")
With topic.getElementsByClassName("product-name")
If .Length Then Data(1, x) = .Item(0).innerText
End With
With topic.getElementsByClassName("price-section-inner")
If .Length Then Data(2, x) = .Item(0).innerText
End With
With topic.getElementsByClassName("madein__text")
If .Length Then Data(0, x) = .Item(0).innerText
End With
x = x + 1: ReDim Preserve Data(2, x)
Next topic
html.body.innerHTML = vbNullString
Next
End With
Next
rezultSheet.Range("A1:C" & x - 1).Value = Application.Transpose(Data)
End Sub