我希望从html中提取超过100个字符的文本。文本可以在任何标签集之间,忽略br,b,strong和i标签。
<([^>]+)>([^<]{100,})<\1>
几乎是正确的,但它会在所有标记处停止
我需要它在除上述标签(br、b、strong、i)之外的任何标签处停止。
<([^>]+)>
if not < and not (<b|<i|<strong|<br)
{100,}
<\1>
我无法弄清楚如何做到这一点,但这大致是我所追求的。
答案 0 :(得分:0)
如果我对问题的理解正确,下面的正则表达式应该可以满足需求。
原始压缩正则表达式 -
(?!</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>)<([A-Za-z_:][\w:.-]*)\s*(?>\s(?:".*?"|'.*?'|[^>]*?)+>|>)(?<!/>)((?:</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>|(?!</?[A-Za-z_:][\w:.-]*\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>).){100,})</\1\s*>
下面是 Perl 测试用例;该正则表达式在 PHP 中同样可用。此测试用例使用本网页的 HTML 源代码作为输入,源代码放在 __DATA__ 部分(此处已省略)。
use strict;
use warnings;

# Matches one element whose content is at least 100 items long, where an item
# is either a single character or one complete b / i / strong / br tag. Any
# other tag inside the content ends the run, so the element only matches when
# the run reaches its own closing tag.
#   capture 1 - name of the enclosing tag
#   capture 2 - the content between the opening and the closing tag
# Flags: x (expanded: whitespace and comments are ignored), s (dot matches newline).
my $rx = qr~
    # ----------------
    # Begin REGEX
    # ----------------

    # - - - - - Exclude these tags - - - - -
    # The enclosing element itself must not be one of the ignored inline tags.
    (?!</?(?:b|i|strong|br)\s*(?:\s(?:".*?"|'.*?'|[^>]*?)+)?/?>)

    # - - - - - Open tags - - - - -
    <
    ([A-Za-z_:][\w:.-]*)
    \s*
    (?>
        \s(?:".*?"|'.*?'|[^>]*?)+ > | >
    )
    # Reject a self-closing opening tag: it has no content to capture.
    (?<!/>)

    # - - - - - Data - - - - -
    (
        (?:
            # Either one of the ignored inline tags, consumed whole ...
            </? (?: b | i | strong | br) \s* (?:\s(?:".*?"|'.*?'|[^>]*?)+)? /?>
            # ... or any single character that does not start another tag.
          | (?! </? [A-Za-z_:][\w:.-]* \s* (?:\s(?:".*?"|'.*?'|[^>]*?)+)? /?> )
            .
        ){100,}
    )

    # - - - - - Closing tag - - - - -
    </\1\s*>

    # ----------------
    # End REGEX, Options: expanded, dot '.' includes newline
    # ----------------
~xs;

# Slurp the whole DATA section in one read. Localising the input record
# separator keeps the change confined to this block instead of altering
# line-reading behaviour for the rest of the program.
my $str = do { local $/; <DATA> };
$str = '' unless defined $str;    # no DATA section: nothing to search

# Report every qualifying element: its tag name and the captured content.
while ($str =~ /$rx/g) {
    print "\n----------------------\n";
    print "Tag = $1\n";
    print "data = $2\n\n";
}
__DATA__
<!DOCTYPE html>
<html>
...
</html>
输出 - 这是从80列控制台剪切和粘贴的。
----------------------
Tag = p
data = I am looking to extract text longer than 100 characters from html. The te
xt can be between any set of tags, ignoring br, b, strong and i tags.
----------------------
Tag = span
data = I am using PHP, my experience was the html parsers do not like invalid ht
ml and html I am working on is horrendous
----------------------
Tag = span
data = You can replace occurrences of the said tags with empty string, that woul
d simplify things, would it not?...
----------------------
Tag = span
data = I do still need the tags... but maybe i could use place holders like <
br> = %br or similar. That might work....
----------------------
Tag = script
data =
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
----------------------
Tag = script
data =
window.showNewUser = true;
(function(){
var cs = window.sessionStorage && window.sessionStorage['tNe
wsletter'];
var m = /:\/\/([^\/]*)/.exec(document.referrer);
var h = {'www.reddit.com':1,'news.ycombinator.com':1};
if(window.sessionStorage && (cs || (m && m.length == 2 && h[
m[1]]))){
showNewUser = false;
StackExchange.ready(function(){
StackExchange.newsletterAd.loader(m[1], cs);
});
}
if (window.localStorage && showNewUser){
var c = parseInt(localStorage['nuCounter'],10);
c = isNaN(c) ? 1 : c+1;
window.localStorage['nuCounter'] = c;
showNewUser = c < 10;
}
})()
----------------------
Tag = p
data = This is a collaboratively edited question and answer site for <b>professi
onal and enthusiast programmers</b>. It's 100% free, no registration required.
----------------------
Tag = script
data =
if (showNewUser) {
document.getElementById('newuser-box').style.display = '';
}
----------------------
Tag = script
data =
var scriptSrc = "http://engine2.adzerk.net/z/8277/adzerk1_2_4_43,adz
erk2_2_17_45?keywords=regex";
if (document.referrer) {
if (/\?/.test(scriptSrc))
scriptSrc += "&";
else
scriptSrc += "?";
scriptSrc += "xReferrer=" + document.referrer;
}
StackExchange.ready(function(){var z = document.createElement("scrip
t"); z.type = "text/javascript"; z.async = "true"; z.src = scriptSrc; var s = do
cument.getElementsByTagName("script")[0]; s.parentNode.insertBefore(z, s);});
----------------------
Tag = script
data =
var careers_adurl="http://careers.stackoverflow.com/ad/js",careers_c
ssurl="http://cdn.sstatic.net/careers/ads/sidebar.min.css?363c15",careers_leader
boardcssurl="http://cdn.sstatic.net/careers/ads/ninja.min.css?363c15",careers_co
mpanycssurl="http://cdn.sstatic.net/careers/ads/company",careers_adselector="div
.hireme, div#hireme";StackExchange.ready(function(){$.ajax({url:"http://cdn.ssta
tic.net/careers/ads/adloader.min.js?363c15",dataType:"script",cache:true})});
----------------------
Tag = script
data = var _gaq=_gaq||[];_gaq.push(['_setAccount','UA-5620270-1']);
_gaq.push(['_setCustomVar', 1, 'tags', '|regex|']);
_gaq.push(['_trackPageview']);
var _qevents = _qevents || [];
(function(){
var s=document.getElementsByTagName('script')[0];
var ga=document.createElement('script');
ga.type='text/javascript';
ga.async=true;
ga.src='http://www.google-analytics.com/ga.js';
s.parentNode.insertBefore(ga,s);
var sc=document.createElement('script');
sc.type='text/javascript';
sc.async=true;
sc.src='http://edge.quantserve.com/quant.js';
s.parentNode.insertBefore(sc,s);
})();
----------------------
Tag = script
data =
StackExchange.ready(function () {
StackExchange.question.init({showAnswerHelp:true,totalCommentCount:5
,shownCommentCount:5,highlightColor:'#F4A83D',backgroundColor:'#FFF',questionId:
10061299});
styleCode();
StackExchange.realtime.subscribeToQuestion('1', '10061299');
});
----------------------
Tag = script
data =
StackExchange.init({"stackAuthUrl":"https://stackauth.com","serverTime":
1333905762,"styleCode":true,"enableUserHovercards":true,"site":{"name":"Stack Ov
erflow","description":"Q&A for professional and enthusiast programmers"},"user":
{"isAnonymous":true,"fkey":"cabcf78a4b0938630036bcdce8d1268d","hasNewMessages":n
ull,"inboxUnviewedCount":-1}});
StackExchange.using.setCacheBreakers({"js/prettify-full.js":"4ed9aa4b8d0
e","js/moderator.js":"f57f00dcb1ac","js/full-anon.js":"7f2dec862ddc","js/full.js
":"df7d7ab85566","js/wmd.js":"78027d6539c8","js/third-party/jquery.autocomplete.
min.js":"e5f01e97f7c3","js/mobile.js":"3b13ff9fd1f2","js/help.js":"fc9fb0517db2"
,"js/tageditor.js":"c1ba807b32aa","js/tageditornew.js":"bd66fabe1c71","js/inline
-tag-editing.js":"be882e188985","js/revisions.js":"8c6bcd93b7fe","js/suggested-e
dits.js":"7f24e342d359","js/probes.js":"beb933322ff0"});
----------------------
Tag = script
data =
StackExchange.ready(function () {
var shareUrl = 'http%3a%2f%2fstackoverflow.com%2fq%2f10061299';
var shareMsg = 'Ignore+br%2c+b%2c+strong+and+i+html+tags+in+regex';
StackExchange.share.gplus($('#gp-share-10061299'), shareUrl, shareMs
g) ;
StackExchange.share.facebook($('#fb-share-10061299'), shareUrl, shar
eMsg);
StackExchange.share.twitter($('#twitter-share-10061299'), shareUrl,
shareMsg);
});
----------------------
Tag = script
data =
StackExchange.ready(function() {
initTagRenderer("".split(" "), "".split(" "));
prepareEditor({
heartbeatType: 'answer',
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
});