Html解析器只想在特定标记中保留特定内容

时间:2012-11-06 12:16:03

标签: php html html-parsing

我想解析html数据。我想删除标签中我不关心的所有数据以及标签本身。我只想保留特定span类中的数据。但是也要删除span标签。

网上是否已经有现成的脚本可以做到这一点?如果没有,最好的实现方法是什么?谢谢。

1 个答案:

答案 0 :(得分:1)

好的,为什么不呢?我还清楚地记得当初所有文档都在脑海里天旋地转的那段日子,就像醉汉眼中旋转的房间。

有很多不同的方法可以解决这个问题,我在上面的评论中只提了一个。我能看到的三种方式是:

  1. 获取所有span元素的列表,逐个检查其类名;如果是目标类,则提取其文本。
  2. 假设只有目标span才带有目标(tgt)类。获取所有具有该类名的元素列表,然后提取列表中每个元素的文本。
  3. 使用DOM函数直接获取所有带有目标(tgt)className的span列表,然后提取列表中每个元素的文本。
下面用一个Google搜索结果页面作为示例,快速演示一下:

    使用Javascript:

    // The collections handed back by getElementsByTagName / getElementsByClassName can be
    // indexed with [] and expose .length, but they are not Array objects.
    // This helper walks any such array-like list and invokes the callback once per entry
    // with (item, index, list).
    // The length is read once up front, so entries added while iterating are not visited.
    function forEachNode(nodeList, func)
    {
        var total = nodeList.length;
        var index = 0;
        while (index < total)
        {
            func(nodeList[index], index, nodeList);
            index += 1;
        }
    }
    
    // Demonstrates three ways of collecting the text of every <span class="st"> in the
    // current document. Each approach builds one string (one line per matching element)
    // and shows it with alert(), so a single call raises three alerts in a row.
    // Takes no arguments and returns nothing; relies on forEachNode() to walk the
    // array-like collections returned by the DOM lookups.
    function grabTextFromSpans()
    {
        var TARGET_CLASS = 'st';

        // True when TARGET_CLASS is one of the whitespace-separated tokens of the
        // element's className. A plain equality test would miss class="st other",
        // which approaches 2 and 3 below do match.
        function hasTargetClass(elem)
        {
            var padded = (' ' + elem.className + ' ').replace(/\s+/g, ' ');
            return padded.indexOf(' ' + TARGET_CLASS + ' ') !== -1;
        }

        // Concatenates the innerText of every element in the list, each followed by "\n".
        function collectText(list)
        {
            var text = '';
            forEachNode(list, function(elem){ text += elem.innerText + "\n"; });
            return text;
        }

        //
        //  1. - assume elements other than spans may also carry the target className:
        //       only keep elements that are (a) a span and (b) have the right className
        //
        var matchingSpans = [];
        forEachNode(document.getElementsByTagName('span'), function(span)
        {
            if (hasTargetClass(span)) matchingSpans.push(span);
        });
        alert(collectText(matchingSpans));


        //
        //  2. - assume only the target elements will possess the target className
        //
        alert(collectText(document.getElementsByClassName(TARGET_CLASS)));


        //
        //  3. - assume that the target elements are spans and have a specific className
        //
        alert(collectText(document.querySelectorAll('span.' + TARGET_CLASS)));
    }
    

    HTML:

    <ol eid="S-WYUPKzGI7JmAWAr4CwBw" id="rso"><!--m--><li class="g"><div class="vsc" sig="u1z">  <div data-ved="0CCQQkgowAA">  <div data-ved="0CCUQkQowAA"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://bebraw.github.com/colorjs/" class="l" onmousedown="return rwt(this,'','','','1','AFQjCNFuHzNJryRAA9QBjqC1250RDZAMCQ','','0CCYQFjAA',null,event)">Colorjs by bebraw</a></h3><div class="s"><div class="f kv"><cite>bebraw.github.com/colorjs/</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:eSrglCqgBAkJ:bebraw.github.com/colorjs/+&amp;cd=1&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','1','AFQjCNEm0e9Xr7p0eOh7TWkQ81JoqWQNfQ','','0CCcQIDAA',null,event)">Cached</a></span></div><div class="esc slp" id="poS0" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">colorjs provides simple API that may be used to create colors (<em>RGBA</em>, <em>HSVA</em>, <em>HSLA</em>) and perform various color related operations (conversions and such).<br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="kDj">  <div data-ved="0CCkQkgowAQ">  <div data-ved="0CCoQkQowAQ"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="https://github.com/bebraw/colorjs" class="l" onmousedown="return rwt(this,'','','','2','AFQjCNFRF9AfGrmbF5E6IhyKId0ztwA7sQ','','0CCsQFjAB',null,event)">bebraw/colorjs · GitHub</a></h3><div class="s"><div class="f kv"><cite>https://github.com/bebraw/colorjs</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:WtA6gOF2ZqEJ:https://github.com/bebraw/colorjs+&amp;cd=2&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return 
rwt(this,'','','','2','AFQjCNHqdXwTalbWocgzhnaRA1EKTkyMzQ','','0CCwQIDAB',null,event)">Cached</a></span></div><div class="esc slp" id="poS1" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">colorjs provides simple API that may be used to create colors (<em>RGBA</em>, <em>HSVA</em>, <em>HSLA</em>) and perform various color related operations (conversions and such).<br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="PWh">  <div data-ved="0CC4QkgowAg">  <div data-ved="0CC8QkQowAg"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://www.pygame.org/docs/ref/color.html" class="l" onmousedown="return rwt(this,'','','','3','AFQjCNFFK5xPcE_2Yl9D8NhHmBbb_Y8UJg','','0CDAQFjAC',null,event)">color - Pygame Documentation</a></h3><div class="s"><div class="f kv"><cite>www.pygame.org/docs/ref/color.html</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:zXwLCILEH14J:www.pygame.org/docs/ref/color.html+&amp;cd=3&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','3','AFQjCNGwtfM-FoMMusa6z3-GjN68_lw5BQ','','0CDEQIDAC',null,event)">Cached</a>&nbsp;-&nbsp;<a href="/search?hl=en&amp;safe=off&amp;q=related:www.pygame.org/docs/ref/color.html+hlsa+hsva+rgba&amp;tbo=1&amp;sa=X&amp;ei=S-WYUPKzGI7JmAWAr4CwBw&amp;ved=0CDIQHzAC">Similar</a></span></div><div class="esc slp" id="poS2" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">Color.<em>hsva</em> - Gets or sets the <em>HSVA</em> representation of the Color. Gets or sets the <b>...</b> Color.<em>hsla</em> - Gets or sets the <em>HSLA</em> representation of the Color. Gets or sets <b>...</b> Color.normalize - Returns the normalized <em>RGBA</em> values of the Color. 
Returns the <b>...</b><br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="8Ft">  <div data-ved="0CDQQkgowAw">  <div data-ved="0CDUQkQowAw"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://softimage.wiki.softimage.com/sdkdocs/sicppsdk/html/classXSI_1_1MATH_1_1CColor4f.html" class="l" onmousedown="return rwt(this,'','','','4','AFQjCNFu-Fgfc5DxRWUiz6rFQcmtu8VxQw','','0CDYQFjAD',null,event)">CColor4f Class Reference</a></h3><div class="s"><div class="f kv"><cite>softimage.wiki.softimage.com/.../classXSI_1_1MATH_1_1CColor4f....</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:a3m9eQIkVpcJ:softimage.wiki.softimage.com/sdkdocs/sicppsdk/html/classXSI_1_1MATH_1_1CColor4f.html+&amp;cd=4&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','4','AFQjCNE2tLroL_B4pfp_-yHsgN77HgygkA','','0CDcQIDAD',null,event)">Cached</a></span></div><div class="esc slp" id="poS3" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">Detailed Description. A color class represented by single floating values. 
CColor4f supports the <em>RGBA</em>, <em>HSVA</em> and <em>HLSA</em> color models.<br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="E2M">  <div data-ved="0CDkQkgowBA">  <div data-ved="0CDoQkQowBA"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://api.call-cc.org/doc/imlib2" class="l" onmousedown="return rwt(this,'','','','5','AFQjCNHJCEk0145qLjFUDow7uz6--d9ecQ','','0CDsQFjAE',null,event)">imlib2 | chickadee</a></h3><div class="s"><div class="f kv"><cite>api.call-cc.org/doc/imlib2</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:f3rEHKk7xdQJ:api.call-cc.org/doc/imlib2+&amp;cd=5&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','5','AFQjCNFIOeaytJwKaUCFahhv0rZYsCrtNA','','0CDwQIDAE',null,event)">Cached</a></span></div><div class="esc slp" id="poS4" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">Create a color specifier for the given <em>RGBA</em> values. (color/<em>hsva h s v a</em>) =&gt; color procedure. Create a color specifier for the given <em>HSVA</em> values. 
(color/<em>hlsa h l s a</em>) <b>...</b><br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="QIQ">  <div data-ved="0CD4QkgowBQ">  <div data-ved="0CD8QkQowBQ"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://download.autodesk.com/global/docs/softimage2013/en_us/userguide/files/shaderpresets690.htm" class="l" onmousedown="return rwt(this,'','','','6','AFQjCNEcMDDSe_MGltcAPZgdz_Xba5qhrA','','0CEAQFjAF',null,event)">Softimage User's Guide: Scalar Matte</a></h3><div class="s"><div class="f kv"><cite>download.autodesk.com/global/docs/.../en.../shaderpresets690.htm</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:7EBn3t17As0J:download.autodesk.com/global/docs/softimage2013/en_us/userguide/files/shaderpresets690.htm+&amp;cd=6&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','6','AFQjCNH9A6Y1UtnSCaNB5fq9oEMv6l6dIQ','','0CEEQIDAF',null,event)">Cached</a></span></div><div class="esc slp" id="poS5" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">Determines which color model you will select a color from: <em>RGBA</em>, <em>HLSA</em>, or <em>HSVA</em>. Channel. Selects the color channel. 
You can only select from this parameter if <b>...</b><br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="ziI">  <div data-ved="0CEMQkgowBg">  <div data-ved="0CEQQkQowBg"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://packages.python.org/python-igraph/igraph.drawing.colors-module.html" class="l" onmousedown="return rwt(this,'','','','7','AFQjCNFXOQ9ZwDTXEYanvZb2l-be3NWrvg','','0CEUQFjAG',null,event)">igraph.drawing.colors.color_name_to_rgba</a></h3><div class="s"><div class="f kv"><cite>packages.python.org/python.../igraph.drawing.colors-module.html</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:YiCWholHT38J:packages.python.org/python-igraph/igraph.drawing.colors-module.html+&amp;cd=7&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','7','AFQjCNE-O44T9i6KfHzrDu2_TN6q5khIDg','','0CEYQIDAG',null,event)">Cached</a></span></div><div class="esc slp" id="poS6" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="f">10+ items – </span><span class="st">Converts a color given by its <em>RGBA</em> coordinates to <em>HSVA</em> <b>...</b><br></span><table class="tsnip"><tbody><tr><td>hsla_to_rgba</td><td>l</td><td>1.0</td><td>Converts a color given by its <em>HSLA</em> coordinates (hue <b>...</b></td></tr><tr><td>rgba_to_hsva</td><td>b</td><td>1.0</td><td>Converts a color given by its <em>RGBA</em> coordinates to <b>...</b></td></tr></tbody></table></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="ELy">  <div data-ved="0CEkQkgowBw">  <div data-ved="0CEoQkQowBw"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://xrvg.rubyforge.org/rdoc/classes/XRVG/Color.html" 
class="l" onmousedown="return rwt(this,'','','','8','AFQjCNFa4U6Y0BceJtaHJrjPYBL9uluwiQ','','0CEsQFjAH',null,event)">Class: XRVG::Color</a></h3><div class="s"><div class="f kv"><cite>xrvg.rubyforge.org/rdoc/classes/XRVG/Color.html</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:xzEzVZQqjlsJ:xrvg.rubyforge.org/rdoc/classes/XRVG/Color.html+&amp;cd=8&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','8','AFQjCNHAiQ1acFXS45OCe2dh8t04dv79Zw','','0CEwQIDAH',null,event)">Cached</a></span></div><div class="esc slp" id="poS7" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st"><b>...</b> blue complement format255 g g= green grey hsl <em>hsla hsla</em> hsv <em>hsva hsva</em> hue <b>...</b> rgb2h rgb2hsl rgb2hsv rgb2sl rgb2sv <em>rgba</em> saturation svg value white yellow <b>...</b><br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc" sig="AMw">  <div data-ved="0CE4QkgowCA">  <div data-ved="0CE8QkQowCA"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://pygame.readthedocs.org/en/latest/ref/color.html" class="l" onmousedown="return rwt(this,'','','','9','AFQjCNFAhMqlvpl3nu5ke-K5EGXFjdObuA','','0CFAQFjAI',null,event)">pygame.Color — Pygame v1.9.2 documentation</a></h3><div class="s"><div class="f kv"><cite>pygame.readthedocs.org/en/latest/ref/color.html</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:y8n_BdkVwnMJ:pygame.readthedocs.org/en/latest/ref/color.html+&amp;cd=9&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','9','AFQjCNFUzsehzHw76YNkL-4oahK-n-JvIA','','0CFEQIDAI',null,event)">Cached</a></span></div><div class="esc slp" id="poS8" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span 
class="st">Color.<em>hsva</em>, —, Gets or sets the <em>HSVA</em> representation of the Color. <b>...</b> The Color class represents <em>RGBA</em> color values using a value range of 0-255. <b>...</b> The <em>HSLA</em>  components are in the ranges H = [0, 360], S = [0, 100], V = [0, 100], A = [0, 100].<br></span></div></div><!--n--></li><!--m--><li class="g"><div class="vsc">  <div data-ved="0CFMQkgowCQ">  <div data-ved="0CFQQkQowCQ"> </div>   </div><div class="vspib" aria-label="Result details" role="button" tabindex="0"><div class="vspii"><div class="vspiic"></div></div></div>   <h3 class="r"><a href="http://pypixel.googlecode.com/svn-history/r5/trunk/pypixel.py" class="l" onmousedown="return rwt(this,'','','','10','AFQjCNFn-7_0BAq_upcNihBWBYg3GediLg','','0CFUQFjAJ',null,event)">#!/usr/bin/python # # TODO # Thread pypixel so that the end user <b>...</b></a></h3><div class="s"><div class="f kv"><cite>pypixel.googlecode.com/svn-history/r5/trunk/pypixel.py</cite><span class="vshid"><a href="http://webcache.googleusercontent.com/search?q=cache:-prP6iP75CAJ:pypixel.googlecode.com/svn-history/r5/trunk/pypixel.py+&amp;cd=10&amp;hl=en&amp;ct=clnk&amp;gl=au" onmousedown="return rwt(this,'','','','10','AFQjCNE0O_xZcdco81OFkgu3JTMawmr5LQ','','0CFYQIDAJ',null,event)">Cached</a></span></div><div class="esc slp" id="poS9" style="display:none">You +1'd this publicly.&nbsp;<a href="#" class="fl">Undo</a></div><span class="st">TODO # Write color wrapper for <em>HSVA</em>, <em>HSLA</em>, <em>RGBA</em> # import random as randy import pygame from pygame.locals import * # Screen size WIDTH = 640 HEIGHT <b>...</b><br></span></div></div><!--n--></li></ol>