VBA没有完全获取Web表

时间:2014-02-20 12:23:52

标签: excel internet-explorer vba excel-vba web-scraping

我需要从this site获取价格表。

为此,我已经开发了一些代码:

Sub TableExample()
    Dim IE As Object
    Dim doc As Object
    Dim strURL As String

    strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"

    ' replace with URL of your choice

    Set IE = CreateObject("InternetExplorer.Application")
    With IE
         '.Visible = True

        .navigate strURL
        Do Until .readyState = 4: DoEvents: Loop
            Do While .Busy: DoEvents: Loop
                Set doc = IE.document
                GetAllTables doc

                .Quit
            End With
        End Sub

       Sub GetAllTables(doc As Object)

     ' get all the tables from a webpage document, doc, and put them in a new worksheet

    Dim ws As Worksheet
    Dim rng As Range
    Dim tbl As Object
    Dim rw As Object
    Dim cl As Object
    Dim tabno As Long
    Dim nextrow As Long
    Dim I As Long

    Set ws = Sheets("Sheet1")


    For Each tbl In doc.getElementsByTagName("TABLE")
        tabno = tabno + 1
        nextrow = nextrow + 1
        Set rng = ws.Range("B" & nextrow)
        'rng.Offset(, -1) = "Table " & tabno
        If tabno = 5 Then

        For Each rw In tbl.Rows
            colno = 6
            For Each cl In rw.Cells
                If colno = 5 And nextrow < 1 Then
                    Set classColl = doc.getElementsByClassName("shop")
                    Set imgTgt = classColl(nextrow - 2).getElementsByTagName("img").getElementsByClassName("btn-goto-shop")
                    rng.Value = imgTgt(0).getAttribute("alt")

                Else
                    rng.Value = cl.innerText
                End If
                Set rng = rng.Offset(, 1)
                I = I + 1
                colno = colno + 1
            Next cl
            nextrow = nextrow + 1
            Set rng = rng.Offset(1, -I)
            I = 0
        Next rw
        End If
    Next tbl

    ws.Cells.ClearFormats

End Sub

通过此代码,我可以获得所需的结果,但不会获取具有商店名称的最后一列。任何人都可以帮我这个吗?

2 个答案:

答案 0 :(得分:1)

如果检查页面的HTML,则可以看到className为productOffers-listItemOfferPrice的元素具有必需的信息。有比您可能意识到的更多的信息。参见底部的代码输出。


Example from HTML


在主子GetTable中,我使用XHR请求获取页面HTML并将其存储在HTML文档中。

执行.getElementsByClassName("productOffers-listItemOfferPrice")获取所有商品信息时,您需要解析每个元素.outerHTML


辅助函数GetTransactionInfo使用split函数仅获取.outerHTML的产品信息部分。返回的字符串类似于以下示例示例:

"&#10;&#9;&#9;&#9;"product_id": &#9;&#9;&#9;"143513",&#10;&#9;&#9;&#9;"product_name": ..."

辅助函数TidyString采用inputString和regex模式,通过匹配不需要的字符串并将其替换为空文字字符串(vbNullString),应用regex模式匹配来整理产品信息字符串。


正则表达式模式1:

例如,第一个正则表达式模式"&#\d+;"除去了字符串中带有数字的所有&#:

Regex1 Try it


正则表达式模式2:

第二个正则表达式模式Chr$(34) & headers(currentItem) & Chr$(34) & ":"从字符串中删除产品标头信息,即仅获取值。

例如它需要"product_id": "143513"并返回"143513"

Regex2 Try it


示例页面信息(示例)

Sample page view


示例代码输出:

Sample output


VBA代码:

Option Explicit

'Tools > References > HTML Object Library
Public Sub GetTable()

    Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
    headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")

    Application.ScreenUpdating = False

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.idealo.de/preisvergleich/OffersOfProduct/143513.html", False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
    Set html = New HTMLDocument
    With html
        .body.innerHTML = sResponse
        Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
    End With

    Dim currentItem As Long
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        For currentItem = 0 To listItems.Length - 1
            Dim tempString As String, columnValues() As String
            tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
            columnValues = GetColumnValues(tempString, headers)
            .Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
        Next currentItem
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetTransactionInfo(ByVal inputString) As String
    'Split to get just the transaction items i.e. Headers and associated values
    GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function

Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
    'Extract transaction info
    'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
    'Example inputString

    Dim regex As Object, tempString As String
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = matchPattern
    End With

    If regex.TEST(inputString) Then
        TidyString = regex.Replace(inputString, vbNullString)
    Else
        TidyString = inputString
    End If
End Function

Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
    ' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
    ' Extract just the inner string value of each header e.g. 143513
    Dim arr() As String, currentItem As Long, tempString As String
    tempString = inputString
    For currentItem = LBound(headers) To UBound(headers)
        tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
    Next currentItem
    arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
    GetColumnValues = arr
End Function

答案 1 :(得分:-1)

这是我正在运行的修改后的代码

Sub GetAllTables(doc As Object)

 ' get all the tables from a webpage document, doc, and put them in a new worksheet

Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long

Set ws = Sheets("Sheet1")


For Each tbl In doc.getElementsByTagName("TABLE")
    tabno = tabno + 1
    nextrow = nextrow + 1
    Set rng = ActiveSheet.Range("B" & nextrow)
    'rng.Offset(, -1) = "Table " & tabno
    If tabno = 5 Then

For Each rw In tbl.Rows
        colno = 1
        For Each cl In rw.Cells
            If colno = 5 Then
                rng.Value = doc.getElementsByClassName("shop")(nextrow - 6).getElementsByTagName("img")(1).getAttribute("alt")
            Else
                rng.Value = cl.innerText
            End If
            Set rng = rng.Offset(, 1)
            I = I + 1
            colno = colno + 1
        Next cl
        nextrow = nextrow + 1
        Set rng = rng.Offset(1, -I)
        I = 0
    Next rw
    End If
Next tbl

ws.Cells.ClearFormats
End Sub