Web表没有通过VBA获取正确的数据

时间:2014-03-08 20:03:28

标签: vba excel-vba excel

使用后续代码,我可以从此网页http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html

中获取价格表

但是从另一个页面here这个表没有被提取......尽管这两个页面是相同的。我无法弄清楚缺少的地方。

对此的任何帮助都非常明显。

Sub TableExample()
    Dim IE As Object
    Dim doc As Object
    Dim strURL As String

    strURL = "http://www.idealo.de/preisvergleich/OffersOfProduct/143513.html"


    Set IE = CreateObject("InternetExplorer.Application")
    With IE
        .navigate strURL
        Do Until .readyState = 4: DoEvents: Loop
        Do While .Busy: DoEvents: Loop
        Set doc = IE.document
        GetAllTables doc

        .Quit
    End With

End Sub

Sub GetAllTables(doc As Object)


    Dim ws As Worksheet
    Dim rng As Range
    Dim tbl As Object
    Dim rw As Object
    Dim cl As Object
    Dim tabno As Long
    Dim nextrow As Long
    Dim i As Long

    Set ws = Sheets("Sheet1")


    For Each tbl In doc.getElementsByTagName("TABLE")
        tabno = tabno + 1
        nextrow = nextrow + 1
        Set rng = ws.Range("B" & nextrow)
        rng.Offset(, -1) = "Table " & tabno
        If tabno = 5 Then
        For Each rw In tbl.Rows
            colno = 5

            For Each cl In rw.Cells
                If colno = 5 And nextrow > 5 Then
                    Set classColl = doc.getElementsByClassName("shop")
                    Set imgTgt = classColl(nextrow - 6).getElementsByTagName("img")
                    rng.Value = imgTgt(0).getAttribute("alt")
                Else
                    rng.Value = cl.innerText
                End If
                Set rng = rng.Offset(, 1)
                i = i + 1
                colno = colno + 1
            Next cl
            nextrow = nextrow + 1
            Set rng = rng.Offset(1, -i)
            i = 0
        Next rw
        End If

    Next tbl
End Sub

3 个答案:

答案 0 :(得分:0)

表格编号在两个网址之间发生变化。表5是您对第一个网址感兴趣的表,但表6是第二个网址中感兴趣的表。然而,两个感兴趣的表具有相同的id(“offer-list”),因此不是查找第五个表,而是调整代码以查找具有ID“offer-list”的表

变化

  If tabno = 5 Then

  If InStr(1, tbl.outerhtml, "Produktbezeichnung des Shops", vbTextCompare) > 0 Then

这会让你走近。第二个网页上还有其他更改,您当前的代码并没有完全处理 - 但就像我说这会让你接近。

答案 1 :(得分:0)

我已使用

更改了If tabno = 5 Then
For Each tbl In doc.getElementsByTagName("table")
       ' tabno = tabno + 1
       If tbl.className = "orangebox_rowborder" Then

感谢@Ron指导我这个...感谢一大堆Dude

答案 2 :(得分:0)

以下内容适用于每个URL,因此与IE浏览器无关,因此比您当前使用的方法更健壮,并且速度更快。

有关冗长的代码说明,请参见here

Option Explicit

'Tools > References > HTML Object Library
Public Sub GetTable()
    Const URL = "https://www.idealo.de/preisvergleich/OffersOfProduct/1866742_-335-billingham.html" '<==Change this
    Dim sResponse As String, listItems As Object, html As HTMLDocument, headers()
    headers = Array("product_id", "product_name", "product_price", "product_category", "currency", "spr", "shop_name", "delivery_time", "shop_rating", "position", "free_return", "approved_shipping")

    Application.ScreenUpdating = False

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
    Set html = New HTMLDocument
    With html
        .body.innerHTML = sResponse
        Set listItems = .getElementsByClassName("productOffers-listItemOfferPrice")
    End With

    Dim currentItem As Long
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        For currentItem = 0 To listItems.Length - 1
            Dim tempString As String, columnValues() As String
            tempString = TidyString(GetTransactionInfo(listItems(currentItem).outerHTML), "&#\d+;")
            columnValues = GetColumnValues(tempString, headers)
            .Cells(currentItem + 2, 1).Resize(1, UBound(columnValues) + 1) = columnValues
        Next currentItem
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetTransactionInfo(ByVal inputString) As String
    'Split to get just the transaction items i.e. Headers and associated values
    GetTransactionInfo = Split(Split(inputString, """transaction"",")(1), "}")(0)
End Function

Public Function TidyString(ByVal inputString As String, ByVal matchPattern As String) As String
    'Extract transaction info
    'Use regex to find these unwanted strings and replace pattern e.g. &#\d+;
    'Example inputString

    Dim regex As Object, tempString As String
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Global = True
        .MultiLine = True
        .IgnoreCase = False
        .Pattern = matchPattern
    End With

    If regex.test(inputString) Then
        TidyString = regex.Replace(inputString, vbNullString)
    Else
        TidyString = inputString
    End If
End Function

Public Function GetColumnValues(ByVal inputString As String, ByVal headers As Variant) As Variant
    ' Example input string "product_id": "143513","product_name": "Canon 500D Nahlinse 72mm","product_price": "128.0","product_category": "26570","currency": "EUR","spr": "cfd","shop_name": "computeruniverse.net","delivery_time": "long","shop_rating": "100","position": "1","free_return": "14","approved_shipping": "false"
    ' Extract just the inner string value of each header e.g. 143513
    Dim arr() As String, currentItem As Long, tempString As String
    tempString = inputString
    For currentItem = LBound(headers) To UBound(headers)
        tempString = TidyString(tempString, Chr$(34) & headers(currentItem) & Chr$(34) & ":")
    Next currentItem
    arr = Split(Replace$(tempString, Chr$(34), vbNullString), ",")
    GetColumnValues = arr
End Function