优化刮取和循环

时间:2019-12-30 12:11:39

标签: excel vba loops web-scraping

我想找到一个集合并在其上循环以加载每个页面。我正在尝试:

  1. 查找页数-完成(D列)
  2. 循环链接(第A列),并将最后一页的末页号设为i
    • 我知道我可以使用以下内容循环到第1页的下一页:
      For i = 1 To "number of last page (column D)"

| https://voronezh.leroymerlin.ru/catalogue/krovelnye-mastiki/                          |       | -7    | 1     |
| https://voronezh.leroymerlin.ru/catalogue/vodostok/                                   |       | -125  | 2     |
| https://voronezh.leroymerlin.ru/catalogue/rozetki-i-vyklyuchateli/                    |       | -898  | 10    |
| https://voronezh.leroymerlin.ru/catalogue/ramki-dlya-rozetok-i-vyklyuchateley/        |       | -398  | 5     |
| https://voronezh.leroymerlin.ru/catalogue/nakladki-dlya-rozetok-i-vyklyuchateley/     |       | -35   | 1     |
| https://voronezh.leroymerlin.ru/catalogue/podrozetniki/                               |       | -11   | 1     |
| https://voronezh.leroymerlin.ru/catalogue/silovye-kabeli/                             |       | -175  | 2     |

enter image description here

我尝试使用以下代码循环所有url,但无法正常工作。

Sub get_data()
Dim wsSheet As Worksheet, REZULTSheet As Worksheet, Rows As Long, http As New XMLHTTP60, html As New HTMLDocument
Dim i As Integer, topic As HTMLHtmlElement, link As Variant, x As Long, num_pages As Variant, links As Variant
Set wsSheet = Sheets("URLs_2")
Set REZULTSheet = Sheets("Products")
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).row
links = wsSheet.Range("A1:A" & Rows)
num_pages = wsSheet.Range("D1:D" & Rows)
REZULTSheet.Select

For i = 1 To ??? 'num_pages?
    Application.ScreenUpdating = False
    With http
        For Each link In links
            .Open "GET", link & "?display=90&sortby=1&page=" & i, False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
            .send
            Do: DoEvents: Loop Until .readyState = 4
            html.body.innerHTML = .responseText
            For Each topic In html.getElementsByClassName("ui-product-card__info")
                With topic.getElementsByClassName("product-name")
                    If .Length Then x = x + 1: Cells(x, 2) = .item(0).innerText
                End With
                With topic.getElementsByClassName("price-section-inner")
                    If .Length Then Cells(x, 3) = .item(0).innerText
                End With
                With topic.getElementsByClassName("madein__text") '
                    If .Length Then Cells(x, 1) = .item(1).innerText
                End With
            Next topic        
        Next link
    End With
Next i
End Sub

由于大多数代码都是重复的,所以有什么方法可以运行循环以减少代码量。

Current test file

2 个答案:

答案 0 :(得分:2)

您真正想做的是一个重构(我可能会随时间添加)以减少代码复杂性。现在,要解决循环问题,请参见如何生成两个包含url和页数的一维数组,然后对页数使用1个循环,并使用当前循环计数器值索引到另一个数组中。

摆脱自动实例化,使用类名进行限定,使用Long而不是Integer,使用工作表而不是工作表。

在重构中使用数组,以便更快地写到表中。

Option Explicit

Public Sub GetData()
    Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    Dim i As Long, topic As MSHTML.HTMLHtmlElement
    Dim x As Long, pageCounts(), numPages As Long, page As Long, links()

    Application.ScreenUpdating = False

    On Error GoTo errHand

    Set http = New MSXML2.XMLHTTP60
    Set html = New MSHTML.HTMLDocument
    Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send

                html.body.innerHTML = .responseText

                For Each topic In html.getElementsByClassName("ui-product-card__info")
                    x = x + 1
                    With topic.getElementsByClassName("product-name")
                        If .Length Then rezultSheet.Cells(x, 2) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("price-section-inner")
                        If .Length Then rezultSheet.Cells(x, 3) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("madein__text") '
                        If .Length Then rezultSheet.Cells(x, 1) = .Item(1).innerText
                    End With
                Next topic

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
errHand:
    Application.ScreenUpdating = True
End Sub

重构(恐怕不是我的最好,而是一个起点。例如,您将需要一些较低级别的错误处理)。它是更多的代码,但是开始将单独的逻辑任务分配到它们自己的sub / func中:

Option Explicit
Public wsSheet As Worksheet, rezultSheet As Worksheet

Public Sub GetData()

    Dim http As MSXML2.XMLHTTP60, rowCount As Long, pageCounts(), links()

    Application.ScreenUpdating = False

    On Error GoTo errHand

    Set http = New MSXML2.XMLHTTP60
    Set wsSheet = Sheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    With rezultSheet.Cells
        .ClearContents
        .ClearFormats
    End With

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    GetResults http, pageCounts, links

errHand:
    Application.ScreenUpdating = True
    If Err.Number <> 0 Then Debug.Print Err.Number, Err.Description
End Sub


Public Sub GetResults(ByVal http As MSXML2.XMLHTTP60, ByRef pageCounts(), ByRef links())
    Dim i As Long, numPages As Long
    Dim page As Long, html As MSHTML.HTMLDocument

    Set html = New MSHTML.HTMLDocument

    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send

                html.body.innerHTML = .responseText

                WriteOutResults html

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
End Sub


Public Sub WriteOutResults(ByVal html As MSHTML.HTMLDocument)
    Dim topic As MSHTML.HTMLHtmlElement, results()
    Dim r As Long, productCards As Object

    Set productCards = html.getElementsByClassName("ui-product-card__info")

    ReDim results(1 To productCards.Length, 1 To 3)

    For Each topic In productCards
        r = r + 1
        With topic.getElementsByClassName("product-name")
            If .Length Then results(r, 2) = .Item(0).innerText
        End With
        With topic.getElementsByClassName("price-section-inner")
            If .Length Then results(r, 3) = .Item(0).innerText
        End With
        With topic.getElementsByClassName("madein__text") '
            If .Length Then results(r, 1) = .Item(1).innerText
        End With
    Next topic

    Dim lastRow As Long

    lastRow = GetLastRow(rezultSheet)
    rezultSheet.Cells(lastRow + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

Public Function GetLastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    GetLastRow = sh.Cells.Find(What:="*", _
                               After:=sh.Range("A1"), _
                               Lookat:=xlPart, _
                               LookIn:=xlFormulas, _
                               SearchOrder:=xlByRows, _
                               SearchDirection:=xlPrevious, _
                               MatchCase:=False).Row
    On Error GoTo 0
End Function

答案 1 :(得分:2)

采用@QHar解决方案的代码,以便仅在计算机内存中工作...

Option Explicit

Private Sub GetDataInMemory()
    Dim wsSheet As Worksheet, rezultSheet As Worksheet, rowCount As Long
    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
    Dim i As Long, topic As MSHTML.HTMLHtmlElement
    Dim x As Long, pageCounts(), numPages As Long, page As Long, links()
    Dim Data() As String, k As Long

    Set http = New MSXML2.XMLHTTP60
    Set html = New MSHTML.HTMLDocument
    Set wsSheet = ThisWorkbook.Worksheets("URLs_2")
    Set rezultSheet = ThisWorkbook.Worksheets("Products")

    rowCount = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = Application.Transpose(wsSheet.Range("A1:A" & rowCount)) 'turn into 1D array
    pageCounts = Application.Transpose(wsSheet.Range("D1:D" & rowCount))

    ReDim Data(2, k)
    For i = LBound(pageCounts) To UBound(pageCounts)
        numPages = pageCounts(i)

        With http
            For page = 1 To numPages
                .Open "GET", links(i) & "?display=90&sortby=1&page=" & page, False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
                .send
                html.body.innerHTML = .responseText
                For Each topic In html.getElementsByClassName("ui-product-card__info")
                    With topic.getElementsByClassName("product-name")
                        If .Length Then Data(1, x) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("price-section-inner")
                        If .Length Then Data(2, x) = .Item(0).innerText
                    End With
                    With topic.getElementsByClassName("madein__text")
                        If .Length Then Data(0, x) = .Item(0).innerText
                    End With
                    x = x + 1: ReDim Preserve Data(2, x)
                Next topic

                html.body.innerHTML = vbNullString
            Next
        End With
    Next
    rezultSheet.Range("A1:C" & x - 1).Value = Application.Transpose(Data)
End Sub