为了让您了解我的需求:我一直在使用以下代码来解析 <p> 标签内的内容,并将每个句子包装在 <span> 标签中,以便我可以与页面上的句子进行交互。
// Wrap every sentence of each <p> in <span class="sentence">.
// The paragraph is read through .text(), so only its text content is fed to
// the regex and written back with .html().
$('p').each(function () {
    var $paragraph = $(this);
    var plainText = $paragraph.text();
    // $1 = the sentence including its terminator and optional closing quote,
    // $3 = the whitespace (or end of input) that followed it.
    var wrapped = plainText.replace(
        /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        '<span class="sentence">$1</span>$3'
    );
    $(this).html(wrapped);
});
但是,以下一行说明了我的问题:
<p>This is a <a href="#">link</a> and it is removed with the above code! Here is another sentence.</p>
嵌套标签(例如 <a>、<img> 等)位于我正在搜索的 <p> 标签内时,会被我使用的代码删除。我需要保持这些标签完整,使 <p> 标签内的内容保持不变。
我需要:
<p><span class="sentence">This is a <a href="#">link</a> and it is removed with the above code!</span> <span class="sentence">Here is another sentence.</span></p>
在阅读了这篇关于使用正则表达式解析 HTML 的热门帖子(this barn-burner)之后,我得出结论:我需要结合使用某种 HTML 解析器来遍历 <p> 标签内的子标签,然后再使用正则表达式来查找句子。我认为上面列出的正则表达式应该适用于我的大部分用途,希望这一点对您有帮助。
所以:我该怎么做?
答案 0 :(得分:0)
要把自然语言可靠地切分(tokenise)成句子本身就非常困难,这还没有算上把 HTML 牵扯进来所带来的额外复杂性。有一些应用程序等尝试进行自然语言处理(Natural Language Processing),例如 Stanford Tokenizer,它运行在 Java 上(而不是 JavaScript)。
正如人们不断提到的那样,正则表达式不是这个问题的解决方案,语言不规则,所以不要指望只使用正则表达式解决方案。
SO 上有一个相关的问题:Basic NLP in CoffeeScript or JavaScript — Punkt tokenizaton, simple trained Bayes models — where to start? 我认为它相当简洁地总结了 JavaScript 方面的情况。
无论如何,为了至少给你一些可以上手尝试的东西,我为你写了一些代码。在标记/语言变得稍微复杂或有所不同之前,这段代码的效果还算不错,但最终离目标还差得很远。不过,对于你的需求来说,它也许已经足够了,我不确定。
CSS
/* Text emphasis helpers. */
.emphasis {
    font-style: italic;
}

.bold {
    font-weight: bold;
}

/* Both classes on the same element: italic and bold together. */
.emphasis.bold {
    font-style: italic;
    font-weight: bold;
}

.unidentified {
    background-color: pink;
}

/* Highlight colours for the generated sentence spans. */
.sentence0 {
    background-color: yellow;
}

.sentence1 {
    background-color: green;
}

.sentence2 {
    background-color: red;
}

/* Whitespace spans keep their spacing exactly as written. */
.whitespace {
    white-space: pre;
    background-color: blue;
}
JavaScript
/*jslint maxerr: 50, indent: 4, browser: true */
(function () {
"use strict";
// Shared patterns and state used by the sentence-splitting helpers.
//
// rxOpen: first opening tag. "[^>]*" (rather than ".+?") lets one-letter tags
// such as <b> match on their own and lets a tag match even when a newline
// occurs before the next ">".
var rxOpen = new RegExp("<[^\\/>][^>]*>"),
    // rxClose: first closing tag, e.g. </a>.
    rxClose = new RegExp("<\\/.+?>"),
    // rxWhitespace: the whole run of leading whitespace. The quantifier is
    // greedy on purpose; a lazy "+?" with nothing after it matches exactly one
    // character, so a run of whitespace would be consumed one character at a time.
    rxWhitespace = new RegExp("^\\s+"),
    // rxSupStart / rxSupEnd: a <sup> element at the very start of the input and
    // the first </sup>.
    rxSupStart = new RegExp("^<sup\\b[^>]*>"),
    rxSupEnd = new RegExp("<\\/sup>"),
    // Fragments that are concatenated into rxIndex below.
    sentenceEnd = [],
    // Number of "sentence" spans emitted so far; drives the alternating class.
    color = 0,
    rxIndex;

// A non-digit character followed by one or more sentence terminators (. ! ?).
sentenceEnd.push(new RegExp("[^\\d][\\.!\\?]+"));
// The rest of the input must hold an even number of double quotes, i.e. the
// terminator is not inside a quoted span.
sentenceEnd.push(new RegExp("(?=([^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*?$)"));
// The terminator must not be followed by a closer that has no opener before
// it, i.e. it is not inside (...), [...], {...} or |...|.
sentenceEnd.push(new RegExp("(?![^\\(]*?\\))"));
sentenceEnd.push(new RegExp("(?![^\\[]*?\\])"));
sentenceEnd.push(new RegExp("(?![^\\{]*?\\})"));
sentenceEnd.push(new RegExp("(?![^\\|]*?\\|)"));
// Disabled guards for backslash and slash pairs, kept for reference.
//sentenceEnd.push(new RegExp("(?![^\\\\]*?\\\\)"));
//sentenceEnd.push(new RegExp("(?![^\\/.]*\\/)")); // all could be a problem, but this one is problematic

// rxIndex: every fragment above joined, in order, into one pattern.
rxIndex = new RegExp(sentenceEnd.map(function (fragment) {
    return fragment.source;
}).join(""));
/**
 * Find where the first sentence of `html` ends.
 *
 * @param {string} html Text or markup to scan with rxIndex.
 * @returns {number} Index of the last character of the first rxIndex match,
 *     or -1 when rxIndex does not match.
 */
function indexSentenceEnd(html) {
    var found = html.match(rxIndex);
    if (found === null) {
        return -1;
    }
    // Start of the match plus its length, minus one: the final matched character.
    return found.index + found[0].length - 1;
}
/**
 * Append `string` wrapped in a <span> to `array`.
 *
 * For className "sentence" the class becomes "sentence0" or "sentence1"
 * (alternating with the shared `color` counter, which is then incremented),
 * followed by `classNameOpt` when one is given. Any other className is used
 * as-is and `classNameOpt` is ignored.
 *
 * @param {string[]} array Output list of HTML fragments.
 * @param {string} className Base class name.
 * @param {string} string Content placed inside the span, unescaped.
 * @param {string} [classNameOpt] Extra class for "sentence" spans.
 */
function pushSpan(array, className, string, classNameOpt) {
    var classes = className;
    if (className === "sentence") {
        classes = className + (color % 2);
        if (classNameOpt) {
            classes = classes + " " + classNameOpt;
        }
        color += 1;
    }
    array.push('<span class="' + classes + '">' + string + '</span>');
}
/**
 * Move a <sup>...</sup> element that starts `html` into the span that was
 * emitted last, so the superscript stays attached to the preceding text.
 *
 * Nothing is consumed (the input is returned unchanged and `array` is left
 * untouched) when `html` does not start with <sup>, when the <sup> is never
 * closed, or when there is no previous span to attach it to.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Spans emitted so far; the last entry is rewritten.
 * @returns {string} `html` without the consumed <sup> element.
 */
function addSupToPrevious(html, array) {
    var supCloseLength = "</sup>".length,
        spanCloseLength = "</span>".length,
        closeAt,
        end,
        last;
    // No previous span to extend, or no leading <sup>: leave it to the other handlers.
    if (array.length === 0 || html.search(rxSupStart) === -1) {
        return html;
    }
    closeAt = html.search(rxSupEnd);
    // Unclosed <sup>: slicing from -1 would discard everything but the last character.
    if (closeAt === -1) {
        return html;
    }
    end = closeAt + supCloseLength;
    last = array.pop();
    // Insert the <sup> element just before the trailing "</span>" of the previous entry.
    array.push(last.slice(0, -spanCloseLength) + html.slice(0, end) + last.slice(-spanCloseLength));
    return html.slice(end);
}
/**
 * Emit the leading whitespace matched by rxWhitespace as a "whitespace" span.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} `html` without the matched whitespace, or all of `html`
 *     when rxWhitespace does not match.
 */
function leadingWhitespaces(html, array) {
    var found = html.match(rxWhitespace),
        matchedLength;
    if (found === null) {
        return html.slice(0);
    }
    matchedLength = found[0].length;
    pushSpan(array, "whitespace", html.slice(0, matchedLength));
    return html.slice(matchedLength);
}
/**
 * Emit all of `html` as one sentence when it contains no sentence end, or
 * when its first sentence end is also its final character.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} "" when the input was emitted, otherwise `html` unchanged.
 */
function paragraphIsSentence(html, array) {
    var index = indexSentenceEnd(html);
    // indexSentenceEnd yields the index of the last terminator character, so
    // a sentence that closes the input ends at html.length - 1 (comparing
    // against html.length can never succeed).
    if (index === -1 || index >= html.length - 1) {
        pushSpan(array, "sentence", html, "paragraphIsSentence");
        return "";
    }
    return html;
}
/**
 * When `html` contains no opening tag, emit its first sentence.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} What follows the emitted sentence, or all of `html` when
 *     an opening tag is present.
 */
function paragraphNoMarkup(html, array) {
    var cut;
    if (html.search(rxOpen) !== -1) {
        return html.slice(0);
    }
    cut = indexSentenceEnd(html);
    if (cut === -1) {
        // No terminator: the sentence runs to the end of the input.
        cut = html.length;
    }
    // Step past the terminator so it is included in the emitted sentence.
    cut += 1;
    pushSpan(array, "sentence", html.slice(0, cut), "paragraphNoMarkup");
    return html.slice(cut);
}
/**
 * When `html` contains an opening tag and the first sentence end lies before
 * that tag or after the first closing tag, emit everything up to and
 * including the sentence end.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} What follows the emitted sentence, or all of `html` when
 *     nothing was emitted.
 */
function sentenceUncontained(html, array) {
    var firstOpen = html.search(rxOpen),
        end,
        firstClose;
    if (firstOpen === -1) {
        return html.slice(0);
    }
    end = indexSentenceEnd(html);
    if (end === -1) {
        end = html.length;
    }
    firstClose = html.search(rxClose);
    // Sentence end sits between the first opening and first closing tag:
    // not this handler's case.
    if (end >= firstOpen && end <= firstClose) {
        return html.slice(0);
    }
    end += 1;
    pushSpan(array, "sentence", html.slice(0, end), "sentenceUncontained");
    return html.slice(end);
}
/**
 * When the first sentence end lies after the first opening tag and before the
 * first closing tag, emit everything up to and including that closing tag.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} What follows the emitted closing tag, or all of `html`
 *     when nothing was emitted.
 */
function sentenceContained(html, array) {
    var firstOpen = html.search(rxOpen),
        end,
        closing,
        closeAt,
        cut;
    if (firstOpen === -1) {
        return html.slice(0);
    }
    end = indexSentenceEnd(html);
    if (end === -1) {
        end = html.length;
    }
    closing = html.match(rxClose);
    closeAt = closing === null ? -1 : closing.index;
    if (end > firstOpen && end < closeAt) {
        // Cut right after the closing tag so the element stays intact.
        cut = closeAt + closing[0].length;
        pushSpan(array, "sentence", html.slice(0, cut), "sentenceContained");
        return html.slice(cut);
    }
    return html.slice(0);
}
/**
 * Fallback handler: emit all of `html` as a single "sentence2" span.
 *
 * @param {string} html Remaining markup of the paragraph.
 * @param {string[]} array Output list of HTML fragments.
 * @returns {string} Always "", since the whole input is consumed.
 */
function anythingElse(html, array) {
    var fallbackClass = "sentence2",
        origin = "anythingElse";
    pushSpan(array, fallbackClass, html, origin);
    return "";
}
/**
 * Split the content of every <p> element in the document into spans and write
 * the result back to the paragraph's innerHTML.
 *
 * Each pass tries the handlers in a fixed order and stops at the first one
 * that shortens the remaining markup. At most 100 passes run per paragraph;
 * whatever is still unprocessed after that is emitted through anythingElse
 * instead of being dropped.
 */
function guessSenetences() {
    // Order matters: the first handler that consumes input ends the pass.
    var handlers = [
            addSupToPrevious,
            leadingWhitespaces,
            paragraphIsSentence,
            paragraphNoMarkup,
            sentenceUncontained,
            sentenceContained,
            anythingElse
        ],
        paragraphs = document.getElementsByTagName("p");

    Array.prototype.forEach.call(paragraphs, function (paragraph) {
        var html = paragraph.innerHTML,
            array = [],
            safety = 100,
            length,
            step;

        while (html.length && safety) {
            length = html.length;
            for (step = 0; step < handlers.length && html.length === length; step += 1) {
                html = handlers[step](html, array);
            }
            safety -= 1;
        }

        // Pass limit reached with markup left over: keep it rather than lose it.
        if (html.length) {
            anythingElse(html, array);
        }

        paragraph.innerHTML = array.join("");
    });
}
guessSenetences();
}());
上
答案 1 :(得分:-1)
如果要保持标签不变,则需要使用 .html() 而不是 .text()。请查看下面的代码;如果不起作用,请告诉我。DEMO
// Wrap every sentence of each <p> in <span class="sentence">.
// The paragraph is read through .html(), so the regex runs over the markup
// string itself and nested tags are carried into the result.
$('p').each(function () {
    var $paragraph = $(this);
    var markup = $paragraph.html();
    // $1 = the sentence including its terminator and optional closing quote,
    // $3 = the whitespace (or end of input) that followed it.
    var wrapped = markup.replace(
        /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        '<span class="sentence">$1</span>$3'
    );
    $(this).html(wrapped);
});