从HTML TD和Tr中提取值

时间:2012-01-08 09:13:22

标签: regex vba excel-vba html-parsing excel

我有一些HTML源代码,我从网站上获取选项报价。 (请参阅下文)

在tr中提取各种文本值并根据执行价格存储在集合中的最佳方式是什么(在这种情况下,4700中间有4700

有些人建议使用正则表达式,而其他人建议使用html解析器。我在VBA这样做,所以最好的方式是什么?

<!--<td><a href="javascript:popup1('','','1')">Quote</a></td>

<td><a href="javascript:popup1('','','','','CE')"><img src="/images/print3.gif"></a>



</td>-->





                    <td><a href="javascript:chartPopup('NIFTY', 'OPTIDX', '25JAN2012', '4700.00','CE','S&P CNX NIFTY');"><img src="/live_market/resources/images/grficon.gif" /></a></td>

                        <td class="ylwbg"> 2,935,500</td>

                        <td class="ylwbg"> 27,550</td>

                        <td class="ylwbg"> 12,458</td>


                        <td class="ylwbg"> 23.79</td>

                        <!-- End-->

                        <td class="ylwbg">





                            <a href="/live_market/dynaContent/live_watch/get_quote/GetQuoteFO.jsp?underlying=NIFTY&instrument=OPTIDX&strike=4700.00&type=CE&expiry=25JAN2012" target="_blank"> 139.25</a>



                        </td>

                        <!--*Net Change*-->



                        <td class="ylwbg" Style="color:Red;"> -7.35</td>



                        <td class="ylwbg"> 200</td>

                        <td class="ylwbg"> 139.15</td>

                        <td class="ylwbg"> 142.45</td>

                        <td class="ylwbg"> 200</td>

                        <td class="grybg"><a href="/live_market/dynaContent/live_watch/option_chain/optionDates.jsp?symbol=NIFTY&instrument=OPTIDX&strike=4700.00"><b>4700.00</b></a></td>

                        <td class="nobg"> 1,300</td>

                        <td class="nobg"> 76.00</td>

                        <td class="nobg"> 79.00</td>

                        <td class="nobg"> 1,350</td>



                        <!--*Net Change*-->



                            <td class="nobg" Style="color:Red;"> -1.55</td>





                        <td class="nobg">



                            <!-- <a href="javascript:popup1('NIFTY','OPTIDX','25JAN2012','4700.00','PE')"> 76.00</a> -->



                            <a href="/live_market/dynaContent/live_watch/get_quote/GetQuoteFO.jsp?underlying=NIFTY&instrument=OPTIDX&strike=4700.00&type=PE&expiry=25JAN2012" target="_blank"> 76.00</a>







                        </td>



                        <td class="nobg"> 26.33</td>



                        <td class="nobg"> 32,772</td>

                        <td class="nobg"> 103,700</td>



                        <td class="nobg"> 5,123,300</td>



                        <td><a href="javascript:chartPopup('NIFTY', 'OPTIDX', '25JAN2012', '4700.00','PE','S&P CNX NIFTY');"><img src="/live_market/resources/images/grficon.gif" /></a></td>



<!--<td><a href="javascript:popup1('','','1')">Quote</a></td>

<td><a href="javascript:popup1('','','','','PE')"><img src="/images/print3.gif"></a></td>-->



                    </tr>

1 个答案:

答案 0 :(得分:2)

经过一番摆弄后,我使用

派生了一个正则表达式/ VBA解决方案
  1. XMLHTTP访问网站(更改strSite以适应)
  2. 正则表达式以获取所需的数字
  3. 包含20条记录的变量数组,然后将数字转储到活动工作表
  4. output 查看源HTML以查找正则表达式

    调用选项有一个共同的起始和结束字符串,用于分隔10个值,但有三个不同的字符串

    1. 每个记录匹配的字符串1-4,7-10 <td class="ylwbg"> X </td>
    2. 字符串6在 Style
    3. 之前的>前面有X(和其他文字)
    4. 字符串5包含更长的<a href text X </a>
    5. 正则表达式 .Pattern = "(<tdclass=""ylwbg"")(Style.+?){0,1}>(.+?)(<\/td>)" 提取所有需要的字符串,但稍后需要在字符串5上进一步工作

      Put 选项以<td class="nobg"开头,因此很难通过获得积分1-3的正则表达式提取这些选项

      enter image description here 实际代码

          Sub GetTxt()
          Dim objXmlHTTP As Object
          Dim objRegex As Object
          Dim objRegMC As Object
          Dim objRegM As Object
          Dim strResponse As String
          Dim strSite As String
          Dim lngCnt As Long
          Dim strTemp As String
          Dim X(1 To 20, 1 To 10)
          X(1, 1) = "OI"
          X(1, 2) = "Chng in vol"
          X(1, 3) = "Volume"
          X(1, 4) = "IV"
          X(1, 5) = "LTP"
          X(1, 6) = "Net Chg"
          X(1, 7) = "Bid Qty"
          X(1, 8) = "Bid Price"
          X(1, 9) = "Ask Price"
          X(1, 10) = "Ask Qnty"
      
          Set objXmlHTTP = CreateObject("MSXML2.XMLHTTP")
          strSite = "http://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionDates.jsp?symbol=NIFTY&instrument=OPTIDX&strike=4700.00"
      
          On Error GoTo ErrHandler
          With objXmlHTTP
              .Open "GET", strSite, False
              .Send
              If .Status = 200 Then strResponse = .ResponseText
          End With
          On Error GoTo 0
      
          Set objRegex = CreateObject("vbscript.regexp")
          With objRegex
              '*cleaning regex* to remove all spaces
              .Pattern = "[\xA0\s]+"
              .Global = True
              strResponse = .Replace(strResponse, vbNullString)
              .Pattern = "(<tdclass=""ylwbg"")(Style.+?){0,1}>(.+?)(<\/td>)"
              If .Test(strResponse) Then
                  lngCnt = 20
                  Set objRegMC = .Execute(strResponse)
                  For Each objRegM In objRegMC
                      lngCnt = lngCnt + 1
                      If Right$(objRegM.submatches(2), 2) <> "a>" Then
                          X(Int((lngCnt - 1) / 10), IIf(lngCnt Mod 10 > 0, lngCnt Mod 10, 10)) = objRegM.submatches(2)
                      Else
                      'Get submatches of the form <a href="/live_market/dynaContent/live_watch/get_quote/GetQuoteFO.jsp?underlying=NIFTY&instrument=OPTIDX&strike=4700.00&type=CE&expiry=23FEB2012" target="_blank"> 206.40</a>
                          strTemp = Val(Right(objRegM.submatches(2), Len(objRegM.submatches(2)) - InStrRev(objRegM.submatches(2), """") - 1))
                          X(Int((lngCnt - 1) / 10), IIf(lngCnt Mod 10 > 0, lngCnt Mod 10, 10)) = strTemp
                      End If
                  Next
              Else
                  MsgBox "Parsing unsuccessful", vbCritical
              End If
          End With
          Set objRegex = Nothing
          Set objXmlHTTP = Nothing
          [a1].Resize(UBound(X, 1), UBound(X, 2)) = X
          Exit Sub
      ErrHandler:
          MsgBox "Site not accessible"
          If Not objXmlHTTP Is Nothing Then Set objXmlHTTP = Nothing
      End Sub