忽略正则表达式中的br,b,strong和i html标记

时间:2012-04-08 07:52:52

标签: regex

我想从 HTML 中提取长度超过 100 个字符的文本。文本可以位于任意一对标签之间,但匹配时需要忽略 br、b、strong 和 i 标签。

<([^>]+)>([^<]{100,})<\1> 几乎是正确的,但它会在遇到任何标签时停止。

我需要它只在遇到除上述标签(br、b、strong、i)之外的其他标签时才停止。

<([^>]+)>

if not < and not (<b|<i|<strong|<br)

{100,}

<\1>

我无法弄清楚如何做到这一点,但这大致是我所追求的。

1 个答案:

答案 0 :(得分:0)

如果我对问题的理解正确,可以使用下面的正则表达式。

原始压缩正则表达式 -

(?!</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>)<([A-Za-z_:][\w:.-]*)\s*(?>\s(?:".*?"|'.*?'|[^>]*?)+>|>)(?<!/>)((?:</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>|(?!</?[A-Za-z_:][\w:.-]*\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>).){100,})</\1\s*>

下面是 Perl 测试用例(该正则表达式在 PHP 中同样可用)。此测试用例使用本网页的源代码作为输入;为节省篇幅,这里 __DATA__ 部分的内容已被省略。

# Compiled pattern that finds an element whose content is at least 100
# "items" long, where the content may contain b/i/strong/br tags but no
# other tag.
#
# Structure of the pattern, top to bottom:
#   1. A negative lookahead refuses to start the match on a b, i, strong or
#      br tag, so those are never used as the enclosing element.
#   2. The opening tag: the tag name is captured as group 1, followed by
#      optional attributes (quoted or unquoted) inside an atomic group.
#      The lookbehind right after it rejects tags that end in "/>".
#   3. The data, captured as group 2: 100 or more repetitions of either
#      a whole b/i/strong/br tag (opening, closing or self-closed), or
#      a single character that is not the start of any tag.
#      Note that an allowed tag counts as ONE repetition, not by its length.
#   4. The closing tag, whose name must equal group 1 (backreference).
#
# Modifiers: x (whitespace and # comments inside the pattern are ignored)
# and s (dot also matches newline), as stated in the trailing comment.
$rx = qr~
# ----------------
# Begin REGEX
# ----------------

         # - - - - - Exclude these tags - - - - -
(?!</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>)

         # - - - - - Open tags - - - - -
<
  ([A-Za-z_:][\w:.-]*)
    \s*
  (?>
    \s(?:".*?"|'.*?'|[^>]*?)+ > | >
  )
  (?<!/>)
         # - - - - - Data - - - - -
(
  (?:
      </? (?: b | i | strong | br)  \s* (?:\s(?:".*?"|'.*?'|[^>]*?)+)? /?>
    | (?! </? [A-Za-z_:][\w:.-]* \s* (?:\s(?:".*?"|'.*?'|[^>]*?)+)? /?> )
      .
  ){100,}
)
         # - - - - - Closing tag - - - - -
</\1\s*>

# ----------------
# End REGEX, Options:  expanded, dot '.' includes newline
# ----------------
~xs;


# Clear the input record separator so the next read returns the whole
# DATA section as a single string instead of one line.
undef $/;
$str = <DATA>;

# Scan the document for successive matches; for each one report the
# enclosing tag name (capture 1) and the text found inside it (capture 2).
while ($str =~ m/$rx/g) {
    print "\n----------------------\n",
          "Tag  = $1\n",
          "data = $2\n\n";
}

__DATA__

<!DOCTYPE html>
<html>
...
</html>

输出(从 80 列宽的控制台复制粘贴而来,因此较长的行在第 80 列处被强制换行):

----------------------
Tag  = p
data = I am looking to extract text longer than 100 characters from html. The te
xt can be between any set of tags, ignoring br, b, strong and i tags.


----------------------
Tag  = span
data = I am using PHP, my experience was the html parsers do not like invalid ht
ml and html I am working on is horrendous


----------------------
Tag  = span
data = You can replace occurrences of the said tags with empty string, that woul
d simplify things, would it not?...


----------------------
Tag  = span
data = I do still need the tags... but maybe i could use place holders like &lt;
br&gt; = %br or similar. That might work....


----------------------
Tag  = script
data =
                StackExchange.ready(function () {
                    StackExchange.helpers.onClickDraftSave('#login-link');
                });



----------------------
Tag  = script
data =
        window.showNewUser = true;

                (function(){
                    var cs = window.sessionStorage && window.sessionStorage['tNe
wsletter'];
                    var m = /:\/\/([^\/]*)/.exec(document.referrer);
                    var h = {'www.reddit.com':1,'news.ycombinator.com':1};
                    if(window.sessionStorage && (cs || (m && m.length == 2 && h[
m[1]]))){
                        showNewUser = false;
                        StackExchange.ready(function(){
                            StackExchange.newsletterAd.loader(m[1], cs);
                        });
                    }
                    if (window.localStorage && showNewUser){
                        var c = parseInt(localStorage['nuCounter'],10);
                        c = isNaN(c) ? 1 : c+1;
                        window.localStorage['nuCounter'] = c;
                        showNewUser = c < 10;
                    }
                })()




----------------------
Tag  = p
data = This is a collaboratively edited question and answer site for <b>professi
onal and enthusiast programmers</b>. It's 100% free, no registration required.


----------------------
Tag  = script
data =
        if (showNewUser) {
            document.getElementById('newuser-box').style.display = '';
        }



----------------------
Tag  = script
data =
            var scriptSrc = "http://engine2.adzerk.net/z/8277/adzerk1_2_4_43,adz
erk2_2_17_45?keywords=regex";
            if (document.referrer) {
                if (/\?/.test(scriptSrc))
                    scriptSrc += "&";
                else
                    scriptSrc += "?";
                scriptSrc += "xReferrer=" + document.referrer;
            }
            StackExchange.ready(function(){var z = document.createElement("scrip
t"); z.type = "text/javascript"; z.async = "true"; z.src = scriptSrc; var s = do
cument.getElementsByTagName("script")[0]; s.parentNode.insertBefore(z, s);});



----------------------
Tag  = script
data =
            var careers_adurl="http://careers.stackoverflow.com/ad/js",careers_c
ssurl="http://cdn.sstatic.net/careers/ads/sidebar.min.css?363c15",careers_leader
boardcssurl="http://cdn.sstatic.net/careers/ads/ninja.min.css?363c15",careers_co
mpanycssurl="http://cdn.sstatic.net/careers/ads/company",careers_adselector="div
.hireme, div#hireme";StackExchange.ready(function(){$.ajax({url:"http://cdn.ssta
tic.net/careers/ads/adloader.min.js?363c15",dataType:"script",cache:true})});



----------------------
Tag  = script
data = var _gaq=_gaq||[];_gaq.push(['_setAccount','UA-5620270-1']);
        _gaq.push(['_setCustomVar', 1, 'tags', '|regex|']);
_gaq.push(['_trackPageview']);
    var _qevents = _qevents || [];
    (function(){
        var s=document.getElementsByTagName('script')[0];
        var ga=document.createElement('script');
        ga.type='text/javascript';
        ga.async=true;
        ga.src='http://www.google-analytics.com/ga.js';
        s.parentNode.insertBefore(ga,s);
        var sc=document.createElement('script');
        sc.type='text/javascript';
        sc.async=true;
        sc.src='http://edge.quantserve.com/quant.js';
        s.parentNode.insertBefore(sc,s);
    })();



----------------------
Tag  = script
data =

        StackExchange.ready(function () {

            StackExchange.question.init({showAnswerHelp:true,totalCommentCount:5
,shownCommentCount:5,highlightColor:'#F4A83D',backgroundColor:'#FFF',questionId:
10061299});

            styleCode();

                StackExchange.realtime.subscribeToQuestion('1', '10061299');
        });



----------------------
Tag  = script
data =
        StackExchange.init({"stackAuthUrl":"https://stackauth.com","serverTime":
1333905762,"styleCode":true,"enableUserHovercards":true,"site":{"name":"Stack Ov
erflow","description":"Q&A for professional and enthusiast programmers"},"user":
{"isAnonymous":true,"fkey":"cabcf78a4b0938630036bcdce8d1268d","hasNewMessages":n
ull,"inboxUnviewedCount":-1}});
        StackExchange.using.setCacheBreakers({"js/prettify-full.js":"4ed9aa4b8d0
e","js/moderator.js":"f57f00dcb1ac","js/full-anon.js":"7f2dec862ddc","js/full.js
":"df7d7ab85566","js/wmd.js":"78027d6539c8","js/third-party/jquery.autocomplete.
min.js":"e5f01e97f7c3","js/mobile.js":"3b13ff9fd1f2","js/help.js":"fc9fb0517db2"
,"js/tageditor.js":"c1ba807b32aa","js/tageditornew.js":"bd66fabe1c71","js/inline
-tag-editing.js":"be882e188985","js/revisions.js":"8c6bcd93b7fe","js/suggested-e
dits.js":"7f24e342d359","js/probes.js":"beb933322ff0"});




----------------------
Tag  = script
data =
        StackExchange.ready(function () {
            var shareUrl = 'http%3a%2f%2fstackoverflow.com%2fq%2f10061299';
            var shareMsg = 'Ignore+br%2c+b%2c+strong+and+i+html+tags+in+regex';

            StackExchange.share.gplus($('#gp-share-10061299'), shareUrl, shareMs
g) ;
            StackExchange.share.facebook($('#fb-share-10061299'), shareUrl, shar
eMsg);
            StackExchange.share.twitter($('#twitter-share-10061299'), shareUrl,
shareMsg);
        });



----------------------
Tag  = script
data =
    StackExchange.ready(function() {
        initTagRenderer("".split(" "), "".split(" "));

        prepareEditor({
            heartbeatType: 'answer',
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
        });

    });