使用post数据和xlmlhttp

时间:2018-02-12 06:59:04

标签: excel vba post web-scraping

我仍在尝试了解有关抓取的更多信息,我可以设计一个能够让我获得所需结果的代码。

这是代码

Sub Test()
    Dim e          As Variant
    Dim ie          As Object
    Dim ulElem      As Object
    Dim liElem      As Object
    Dim anchElem    As Object
    Dim dt          As Date
    Dim lDay        As Integer
    Dim lMnth      As Integer
    Dim lYear      As Integer
    Dim r          As Long

    Set ie = CreateObject("InternetExplorer.Application")
    dt = Date - 2
    lDay = Day(dt)
    lMnth = Month(dt)
    lYear = Year(dt)

    With ie
        .Visible = False
        .Navigate "http://www.handelsregisterbekanntmachungen.de/?aktion=suche#Ergebnis"

        Do: DoEvents: Loop Until .readyState = 4

        For Each e In ie.document.getElementsByTagName("select")
            If Len(e.innerText) = 56 Then
                e.selectedIndex = lDay
            ElseIf Len(e.innerText) = 18 Then
                e.selectedIndex = lMnth
            ElseIf Left(e.innerText, 8) = "----2000" Then
                e.selectedIndex = lYear - 1999
            ElseIf InStr(e.innerText, "Alle Bekanntmachungen") > 0 Then
                e.selectedIndex = 1
            End If
        Next e

        For Each e In ie.document.getElementsByTagName("input")
            If e.Value = "Suche starten" Then e.Click: Exit For
        Next e
        Do: DoEvents: Loop Until .readyState = 4
        Application.Wait Now() + TimeValue("00:00:05")

        If InStr(ie.document.body.innerHTML, "Es wurden 0 Treffer gefunden.") > 0 Then
            MsgBox "No Results Found", vbExclamation: Exit Sub
        Else
            For Each ulElem In ie.document.getElementsByTagName("b")
                For Each liElem In ulElem.getElementsByTagName("li")
                    Set anchElem = liElem.getElementsByTagName("a")
                    If anchElem.Length > 0 Then
                        r = r + 1
                        Cells(r, 1) = Mid(anchElem.Item(0).innerText, 11)
                    End If
                Next liElem
            Next ulElem
        End If
    End With
End Sub

但是,为了尝试更多地了解XMLHTTP请求,我正在寻找一种方法来获得相同的结果但不使用IE。所以我认为使用XMLHTTP会更有效率,特别是我可以在为搜索过程设置所需的选项后看到后期数据。

1 个答案:

答案 0 :(得分:0)

看一下下面的例子:

Option Explicit

Sub Test()

    Dim sState As String
    Dim sCourt As String
    Dim dtFrom As Date
    Dim dtTill As Date
    Dim sSubject As String
    Dim sOrder As String
    Dim oStates As Object
    Dim oCourts As Object
    Dim oSubjects As Object
    Dim oOrders As Object
    Dim sStateCode As String
    Dim sCourtId As String
    Dim sSubjectVal As String
    Dim sOrderVal As String
    Dim aData

    ' Set query data
    sState = ""
    sCourt = ""
    dtFrom = DateSerial(2018, 2, 11)
    dtTill = DateSerial(2018, 2, 11)
    sSubject = ""
    sOrder = "Aktenzeichen"

    ' Retrieve options
    GetOptions oStates, oCourts, oSubjects, oOrders

    ' Validate query parameters
    If Not oStates.Exists(sState) Then MsgBox "State valid values:" & vbCrLf & vbCrLf & """" & Join(oStates.Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oCourts(oStates(sState)).Exists(sCourt) Then MsgBox "Court valid values:" & vbCrLf & vbCrLf & """" & Join(oCourts(oStates(sState)).Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oSubjects.Exists(sSubject) Then MsgBox "Subject valid values:" & vbCrLf & vbCrLf & """" & Join(oSubjects.Keys(), """" & vbCrLf & """") & """": Exit Sub
    If Not oOrders.Exists(sOrder) Then MsgBox "Order valid values:" & vbCrLf & vbCrLf & """" & Join(oOrders.Keys(), """" & vbCrLf & """") & """": Exit Sub

    ' Request data
    sStateCode = oStates(sState)
    sCourtId = oCourts(sStateCode)(sCourt)
    sSubjectVal = oSubjects(sSubject)
    sOrderVal = oOrders(sOrder)
    GetData sStateCode, sCourt, sCourtId, dtFrom, dtTill, sSubjectVal, sOrderVal, aData

    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        Output2DArray .Cells(1, 1), aData
        .Columns.AutoFit
    End With

    MsgBox "Completed"

End Sub

Sub GetOptions(oStates As Object, oCourts As Object, oSubjects As Object, oOrders As Object)

    Dim sContent As String
    Dim aTmp0
    Dim aTmp1
    Dim vItem
    Dim oTmp
    Dim i As Long

    ' Retrieve request options from search page
    XmlHttpRequest "GET", "https://www.handelsregisterbekanntmachungen.de/?aktion=suche", "", "", "", sContent

    ' Get each state and code
    ExtractOptions sContent, "land", oStates

    ' Get courts with courts ids for each federal state
    Set oCourts = CreateObject("Scripting.Dictionary")
    For Each vItem In oStates.Items()
        ' Put courts and ids into temp dictionary
        Set oTmp = CreateObject("Scripting.Dictionary")
        If vItem <> "" Then
            ' Extract the whole JS array statement with courts names
            ParseResponse "BundeslandArray\['" & vItem & "'\]=new Array\(('[^']*'(?:,'[^']*')*)\);", sContent, aTmp0, False
            ' Extract each court name into temp array
            ParseResponse "'([^']*)'", (aTmp0(0)), aTmp0, False
            ' Extract the whole JS array statement with courts ids
            ParseResponse "BundeslandArrayId\['" & vItem & "'\]=new Array\(('[^']*'(?:,'[^']*')*)\);", sContent, aTmp1, False
            ' Extract each court id into temp array
            ParseResponse "'([^']*)'", (aTmp1(0)), aTmp1, False
            For i = 0 To UBound(aTmp0)
                oTmp(DecodeHTMLEntities((aTmp0(i)))) = DecodeHTMLEntities((aTmp1(i)))
            Next
        End If
        ' Add dummy item
        oTmp("") = ""
        ' Put courts-ids for the state code into dictionary
        Set oCourts(vItem) = oTmp
    Next
    ' Add dummy item
    oStates("") = ""

    ' Get subjects
    ExtractOptions sContent, "gegenstand", oSubjects
    ' Add dummy item
    oSubjects("") = "0"

    ' Get sort order types
    ExtractOptions sContent, "order", oOrders

End Sub

Sub GetData(sStateCode As String, sCourt As String, sCourtId As String, dtFrom As Date, dtTill As Date, sSubjectVal As String, sOrderVal As String, aData)

    Dim i As Long
    Dim oQuery As Object
    Dim sQuery As String
    Dim sContent As String

    ' Set query parameters
    Set oQuery = CreateObject("Scripting.Dictionary")
    With oQuery
        .Add "suchart", "uneingeschr"
        .Add "button", "Start search"
        .Add "land", sStateCode
        .Add "gericht", sCourtId
        .Add "gericht_name", sCourt
        .Add "seite", ""
        .Add "l", ""
        .Add "r", ""
        .Add "all", "false"
        .Add "vt", Day(dtFrom)
        .Add "vm", Month(dtFrom)
        .Add "vj", Year(dtFrom)
        .Add "bt", Day(dtTill)
        .Add "bm", Month(dtTill)
        .Add "bj", Year(dtTill)
        .Add "fname", ""
        .Add "fsitz", ""
        .Add "rubrik", ""
        .Add "az", ""
        .Add "gegenstand", sSubjectVal
        .Add "anzv", "alle"
        .Add "order", sOrderVal
    End With
    sQuery = EncodeQueryParams(oQuery)

    ' Retrieve search results
    XmlHttpRequest "POST", _
        "https://www.handelsregisterbekanntmachungen.de/de/index.php?aktion=suche", _
        Array( _
            Array("Content-Type", "application/x-www-form-urlencoded"), _
            Array("Content-Length", Len(sQuery) _
            ) _
        ), _
        sQuery, _
        "", _
        sContent

    ' Parse response
    sContent = Replace(sContent, "<br>", vbCrLf)
    ParseResponse "<li[^>]*><a[^>]*?href=""javascript:NeuFenster\('([^']*)'\)""[^>]*>([^<]*)<ul[^>]*>([\s\S]*?)</ul>", sContent, aData, False
    For i = 0 To UBound(aData, 1)
        aData(i)(0) = "http://www.handelsregisterbekanntmachungen.de/en/skripte/hrb.php?" & aData(i)(0)
    Next

End Sub

Sub ExtractOptions(sContent As String, sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=" & sName & "[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(DecodeHTMLEntities((vItem(1)))) = DecodeHTMLEntities(Replace(vItem(0), """", ""))
    Next

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    ' With CreateObject("MSXML2.ServerXMLHTTP")
        ' .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp0 = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp0, sSubMatch
                Next
                PushItem aData, aTmp0
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function DecodeHTMLEntities(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    DecodeHTMLEntities = oDiv.innerText

End Function

Function EncodeQueryParams(oParams As Object) As String

    Dim aParams
    Dim i As Long

    aParams = oParams.Keys()
    For i = 0 To UBound(aParams)
        aParams(i) = EncodeUriComponent((aParams(i))) & "=" & EncodeUriComponent((oParams(aParams(i))))
    Next
    EncodeQueryParams = Join(aParams, "&")

End Function

Function EncodeUriComponent(strText As String) As String

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
    End If
    EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub