在Excel VBA上通过Msxml2.ServerXMLHTTP.6.0设置Web表刮板

时间:2017-03-19 23:32:25

标签: vba excel-vba msxml mshtml msxml6

我需要做一个网络数据抓取工具。

  1. 我需要登录网站:用户,密码,点击登录按钮
  2. 点击第二个按钮
  3. 等待页面加载,这是相关表格。该表是一个呼叫日志,动态添加新内容,因此它总是令人耳目一新。
  4. 我想从表格内容中排除表单,并限制粘贴到Excel的行。
  5. 我通过InternetExplorer.Application代码使其工作,但我需要切换到MSXML2.XMLHTTP代码,因为它非常慢。

    使用InternetExplorer.Application版本:

     Sub extractTablesData()
     'we define the essential variables
    
     Dim IE As Object, obj As Object
     Dim r As Integer, c As Integer, t As Integer
     Dim elemCollection As Object
    
    
     'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
     Set IE = CreateObject("InternetExplorer.Application")
    
     With IE
     .Silent = True
     .Visible = True
     .navigate ("https://www.clickphone.ro")
    
     ' we ensure that the web page downloads completely before we fill the form automatically
     While IE.readyState <> 4
     DoEvents
     Wend
    Application.Wait Now + TimeValue("00:00:03")
    Set HTMLDoc = IE.document
     HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
     HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
     'Login Button Click               
     With IE.document
    
        Set elems = .getElementsByTagName("a")
        For Each e In elems
    
            If (e.getAttribute("class") = "orange_button") Then
                e.Click
                Exit For
            End If
    
        Next e
    
    End With
    
     'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
     While IE.readyState <> 4
     DoEvents
     Wend
    Set iedoc = IE.document
    
    Set elems = iedoc.getElementsByClassName("black")(12)
        elems.Click
    
     ' again ensuring that the web page loads completely before we start scraping data
     While IE.readyState <> 4
     DoEvents
     Wend
     Application.Wait Now + TimeValue("00:00:05")
     Set iedoc = IE.document
    
    'Clearing any unnecessary or old data in Sheet1
    
     ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents
    
    'Scrapping Data and past to Sheet1
     Set elemCollection = IE.document.getElementsByTagName("table")
    
        For t = 0 To (elemCollection.Length - 1)
            For r = 0 To (elemCollection(t).Rows.Length - 1)
                For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
                    ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
                Next c
            Next r
        Next t
    
     End With
    
     ' cleaning up memory
     Set IE = Nothing
    
     End Sub
    

    这是我对MSXMLHTTP的尝试:

    Option Explicit
     'reference to Microsoft Internet Controls
     'reference to Microsoft HTML Object Library
    
    Sub Web_Table_Option_One()
    Dim xml    As Object
    Dim html   As Object
    Dim objTable As Object
    Dim result As String
    Dim lRow As Long
    Dim lngTable As Long
    Dim lngRow As Long
    Dim lngCol As Long
    Dim ActRw As Long
    
    Set xml = CreateObject("MSXML2.XMLHTTP.6.0")
    
    Set html = CreateObject("htmlFile")
    
    With xml
    .Open "POST", "https://www.clickphone.ro/login.html", False
    .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
    .send "userName=USER&password=XXXXxxxxXxxxxXXX"
    .Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False 
    .setRequestHeader "Content-type", "text/xml"
    .send
    End With
    
    html.body.innerHTML = xml.responseText
    
    Set objTable = html.getElementsByTagName("table")
     For lngTable = 0 To objTable.Length - 1
            For lngRow = 0 To objTable(lngTable).Rows.Length - 1
                For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                    ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
                Next lngCol
            Next lngRow
            ActRw = ActRw + objTable(lngTable).Rows.Length + 1
        Next lngTable
    End Sub
    

    HTML源代码:

    对于用户,传递,登录按钮:

    <form action="/login.html" id="toploginform" name="toploginform" method="post">
                                                                          <script>
                                                function processLoginForm(){
                                                    with (document.toploginform) {
                                                        if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
                                                        document.getElementById('toploginform').submit();
                                                    }
                                                }
                                            </script>
    
                                                                                <fieldset>
                                                <input name="userlogin" type="hidden" id="userlogin" value="true" />
                                                <span class="text">
                                                <input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
                                                </span> <span class="text">
                                                <input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
                                                </span> 
                                                <input name="authcode" type="hidden" id="authcode" value="false" />
                                                <span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
                                                 <span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
                                                    <input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
                                                    <label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
                                            </span>
                                            </fieldset>                         
                                                                    </form>
    

    表页按钮:

    <br />&nbsp;<img src="/images/sageata_orange.gif" width="7" height="8" />&nbsp;<a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>
    

    表源代码:

    <table class="TabelDate" cellspacing="0">
      <thead>
        <tr>
          <th width="130">Data</th>
          <th>Sursa</th>
          <th>Destinatie</th>
          <th>Durata</th>
          <th class="ultima">Status</th>
        </tr>
      </thead>
      <tr class="u">    <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
                <td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr>  <tr class="gri">    <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>
    

2 个答案:

答案 0 :(得分:0)

我设法部分解决了我的问题。现在我可以使用XmlHttp登录并检索我需要的表。我在这里发布了工作代码,所以每个人都可以使用它(我没有为这段代码获得任何学分,我是在不同论坛的帮助下完成的)

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
    For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

现在我有两个问题...... 如何从父“表”获取子表“表”(我在后面的表是在一个更大的表中,请参见下面的源代码),我想只得到第一行,但不包括“表格”行(它是一个href链接) Source Code

我怎么能连续得到这个(这个表是动态的,每当有人打电话给我时,它会更新,第一行,正在不断更新)

答案 1 :(得分:0)

我的工作代码2.0版:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
ThisWorkbook.Sheets("LogClickPhone").Range("A2") = objTable(1).Rows(1).Cells(0).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("B2") = objTable(1).Rows(1).Cells(1).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("C2") = objTable(1).Rows(1).Cells(2).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("D2") = objTable(1).Rows(1).Cells(3).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("E2") = objTable(1).Rows(1).Cells(4).innerText
End Sub

我设法只获得我需要的行,但它很慢,需要38.5秒才能完成。我想我最好使用MSXML2.DOMDocument.6.0结构来获取我需要的文本。但我不知道该怎么做。 题: 我如何自动化这段代码,使其每60秒左右运行一次? TX