我需要在文本中匹配带有变音符号的单词,并正确处理单词边界。给定一个单词令牌(word),
我的正则表达式如下:
// Build a whole-word pattern for the current token. In JavaScript `\b` only
// treats [A-Za-z0-9_] as word characters, so accented letters count as
// non-word characters when the boundary is evaluated.
// NOTE(review): `word` is interpolated unescaped — assumes it contains no
// regex metacharacters; confirm against the caller.
var wordRegex = new RegExp("\\b(" + word + ")\\b", "g");
// End offset recorded for the previous occurrence of this token, or -1 when it
// has not been located yet. An explicit has() check replaces
// `seen.get(token) || -1`, which misread a stored end offset of 0 (falsy) as
// "never seen" and re-accepted the match at index 0.
var previousEnd = seen.has(token) ? seen.get(token) : -1;
while ((match = wordRegex.exec(text)) !== null) {
  // Take the first occurrence that starts after the previously recorded one.
  if (match.index > previousEnd) {
    var wordStart = match.index;
    var wordEnd = wordStart + token.length - 1; // inclusive end offset
    item.characterOffsetBegin = wordStart;
    item.characterOffsetEnd = wordEnd;
    seen.set(token, wordEnd);
    break;
  }
}
这对于 ciao、casa 等普通单词可以正常工作。但是,当文本中出现 però、così 等带变音符号的单词时,它就无法正常工作:
/**
 * Return the inclusive end offset recorded for the previous occurrence of
 * `token`, or -1 when the token has not been located yet.
 *
 * Uses an explicit has() check rather than `get(token) || -1`: an end offset
 * of 0 (a one-character token at the very start of the text) is falsy and
 * would otherwise be mistaken for "never seen".
 *
 * @param {Map<string, number>} seenEnds - token -> last recorded end offset
 * @param {string} token - token to look up
 * @returns {number} the recorded end offset, or -1
 */
function lastSeenEnd(seenEnds, token) {
  return seenEnds.has(token) ? seenEnds.get(token) : -1;
}

// token -> inclusive end offset of its most recently located occurrence, so
// that a repeated token resolves to its next position in the text.
const seen = new Map();
const text = "Ci son macchine nascoste e, però, nascoste male";
const tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i);
tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": token
  };
  // Escape regex metacharacters so the token is matched literally.
  const escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  // `\b` only treats [A-Za-z0-9_] as word characters, so there is no boundary
  // between an accented letter and the punctuation/space after it: a token
  // such as "però" followed by "," produces no match with this pattern.
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // Looked up once: `seen` is only updated right before the loop exits.
  const previousEnd = lastSeenEnd(seen, token);
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(text)) !== null) {
    console.log("\t---->", match.index);
    // Take the first occurrence that starts after the previously recorded one.
    if (match.index > previousEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
});
可以看到,某些单词(例如 macchine 或 nascoste)能够匹配,因此得到了 match.index;而其他单词(例如 però)则无法被该正则表达式匹配,match 变量为 null:
macchine ----> /\b(macchine)\b/g
----> 7
nascoste ----> /\b(nascoste)\b/g
----> 16
e, ----> /\b(e\,)\b/g
però, ----> /\b(però\,)\b/g
nascoste ----> /\b(nascoste)\b/g
----> 16
----> 34
那么如何编写也支持变音符号的边界正则表达式?
[UPDATE]
按照评论中建议的方法,我在应用单词 Regex 之前,先对每个 token 调用 removeDiacritics 去除变音符号,并对整个 text 做同样的处理,例如:
// Run the whole text through removeDiacritics once, up front.
// NOTE(review): removeDiacritics is not defined in this snippet — presumably a
// helper that strips accents from a string; confirm against its definition.
var normalizedText = removeDiacritics(text);
// for each token...
// Escape regex metacharacters so the token is matched literally.
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
// Apply the same transformation to the pattern so it lines up with normalizedText.
escaped = removeDiacritics(escaped);
var wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
var match = null;
// Walk successive matches against the normalized text instead of the original.
while ((match = wordRegex.exec( normalizedText )) !== null)
{
//...
这一次,带重音的单词也能被 \b 单词边界正确捕获。当然,这种方法并非最优,因为必须对每个令牌都调用一次 removeDiacritics;更好的方案是只对整个文本做一次这样的处理。
答案 0 :(得分:1)
这是我们在评论中提出的解决方案,用于将带有变音符号的单词映射到它们在文本中的索引:
/**
 * Strip diacritics from a string by delegating to lodash's `_.deburr`.
 *
 * @param {string} text - input handed straight to `_.deburr`
 * @returns {string} whatever `_.deburr` returns for `text`
 */
function removeDiacritics(text) {
  const deburred = _.deburr(text);
  return deburred;
}
// token -> inclusive end offset of its most recently located occurrence, so
// that a repeated token resolves to its next position in the text.
const seen = new Map();
const text = "Ci son macchine nascoste e, però, nascoste male";
const tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i);
// Strip diacritics from the whole text once; every token pattern below is
// matched against this copy so that the ASCII-only `\b` sees plain letters.
// NOTE(review): offsets found below index into normalizedText. They equal
// offsets in `text` only while removeDiacritics preserves string length —
// confirm for the inputs used.
const normalizedText = removeDiacritics(text);
tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": removeDiacritics(token)
  };
  // Escape regex metacharacters, then strip diacritics so the pattern lines up
  // with normalizedText.
  let escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  escaped = removeDiacritics(escaped);
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // End offset recorded for the previous occurrence of this token, or -1 when
  // it has not been located yet. An explicit has() check replaces
  // `seen.get(token) || -1`, which misread a stored end offset of 0 (falsy)
  // as "never seen" and re-accepted the match at index 0.
  const previousEnd = seen.has(token) ? seen.get(token) : -1;
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(normalizedText)) !== null) {
    console.log("\t---->", match.index);
    // Take the first occurrence that starts after the previously recorded one.
    if (match.index > previousEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.11/lodash.min.js"></script>