JavaScript正则表达式将单词的边界与变音符号匹配

时间:2019-03-07 18:06:10

标签: javascript regex

我必须在文本文档中匹配具有变音符号的单词的单词边界。给定一个word令牌,我的正则表达式看起来像

// Whole-word matcher for the current word. The "g" flag makes exec() resume
// after the previous hit, so the loop walks through successive occurrences
// in `text`.
var wordRegex = new RegExp("\\b(" + word + ")\\b", "g");
for (match = wordRegex.exec(text); match !== null; match = wordRegex.exec(text)) {
    // Skip occurrences that do not start after the end offset already
    // recorded for this token (or -1 when nothing usable is recorded).
    if (!(match.index > (seen.get(token) || -1))) {
        continue;
    }

    // Inclusive character offsets of this occurrence.
    var wordStart = match.index;
    var wordEnd = wordStart + token.length - 1;
    item.characterOffsetBegin = wordStart;
    item.characterOffsetEnd = wordEnd;

    // Remember where this occurrence ended and stop at the first usable hit.
    seen.set(token, wordEnd);
    break;
}

这对于 ciao、casa 等普通单词可以正常工作。但是,当文本中出现 però、così 等带变音符号的单词时,它就无法正常工作了:

// Reproduction script: for every token of `text`, look for its next
// whole-word occurrence and record the inclusive character offsets.
// `seen` maps a token to the end offset of its most recently accepted match.
const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male"
var tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i)
for (const [position, token] of tokens.entries()) {
  const item = { index: position + 1, word: token };

  // Escape regex metacharacters so the token is matched literally.
  const pattern = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  const wordRegex = new RegExp("\\b(" + pattern + ")\\b", "g");
  console.log(token, "---->", wordRegex)

  let match = wordRegex.exec(text);
  while (match !== null) {
    console.log("\t---->", match.index)
    const previousEnd = seen.get(token) || -1;
    if (match.index > previousEnd) {
      // First occurrence that starts after the previously recorded end.
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;

      seen.set(token, wordEnd);
      break;
    }
    match = wordRegex.exec(text);
  }
}

您可以看到,某些单词(例如 macchine、nascoste)可以匹配,因此我得到了 match.index;而对于其他单词(例如 però),正则表达式不起作用,match 变量为 null:

macchine ----> /\b(macchine)\b/g
    ----> 7
nascoste ----> /\b(nascoste)\b/g
    ----> 16
e, ----> /\b(e\,)\b/g
però, ----> /\b(però\,)\b/g
nascoste ----> /\b(nascoste)\b/g
    ----> 16
    ----> 34

那么如何编写也支持变音符号的边界正则表达式?

[UPDATE] 按照评论中建议的方法,我在构建 wordRegex 之前先去除每个 token 的变音符号,并对整个 text 做同样的处理,例如:

// Strip diacritics from the whole text once; matching below runs against
// this normalized copy instead of the original text.
var normalizedText = removeDiacritics(text);
// for each token...
// Escape regex metacharacters so the token is matched literally.
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
// Normalize the token the same way as the text so both sides compare equal.
escaped = removeDiacritics(escaped);
// Whole-word, global matcher for the normalized token.
var wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
var match = null;
// Walk through successive occurrences of the token in the normalized text.
// NOTE(review): the loop body is elided in this excerpt.
while ((match = wordRegex.exec( normalizedText )) !== null) 
{
                             //...

这次,带重音的单词也能被 \b 单词边界捕获。当然,这种方法并不是最优的,因为必须对每个令牌都调用一次 removeDiacritics;更好的做法是只对整个文本执行一次。

1 个答案:

答案 0 :(得分:1)

这是我们在评论中提出的解决方案,用于将带有变音符号的单词映射到其在文本中的索引:

/**
 * Strip diacritical marks from a string (e.g. "però" -> "pero").
 *
 * Delegates to lodash's `_.deburr` when lodash is loaded. When it is not
 * (for example when this snippet runs outside the page that includes the
 * lodash <script> tag), it falls back to Unicode NFD decomposition followed
 * by removal of the combining marks U+0300-U+036F. The fallback does not
 * expand letters such as "ß" or "Æ" the way `_.deburr` does.
 *
 * @param {string} text - Text to normalize; null/undefined yield "".
 * @returns {string} The text without diacritics.
 */
function removeDiacritics(text) {
  if (typeof _ !== "undefined" && _ && typeof _.deburr === "function") {
    return _.deburr(text);
  }
  const value = text == null ? "" : String(text);
  return value.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
}

// Characters that may appear inside a token. The tokenizer and the boundary
// check are both derived from this single list so they can never disagree
// about where a word ends (which is exactly what goes wrong with `\b`, whose
// notion of a word character is limited to [A-Za-z0-9_]).
const WORD_CHARS = "a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ";
const TOKEN_SEPARATOR = new RegExp("[^" + WORD_CHARS + "]+", "i");
const WORD_CHAR = new RegExp("[" + WORD_CHARS + "]", "i");

/**
 * Tell whether `source` has a token character at `position`.
 * Positions outside the string count as "not a token character".
 *
 * @param {string} source
 * @param {number} position
 * @returns {boolean}
 */
function isWordCharAt(source, position) {
  return (
    position >= 0 &&
    position < source.length &&
    WORD_CHAR.test(source.charAt(position))
  );
}

/**
 * Map each token to the inclusive character offsets of its next whole-word
 * occurrence in `source`.
 *
 * The search runs on the ORIGINAL text, not on a diacritics-free copy, so the
 * reported offsets are always valid indices into `source` even when
 * normalization would change the string length, and tokens that differ only
 * by accents ("pero" / "però") are never confused with each other.
 *
 * @param {string} source - Text the tokens were extracted from.
 * @param {string[]} words - Tokens, in the order they should be located.
 * @param {Map<string, number>} [lastEnds] - Token -> end offset of its last
 *   located occurrence; updated in place so repeated tokens advance.
 * @returns {Array<{index: number, word: string, characterOffsetBegin?: number,
 *   characterOffsetEnd?: number}>} One item per token. The offsets are absent
 *   when the token is empty or has no further whole-word occurrence.
 */
function locateTokens(source, words, lastEnds = new Map()) {
  const located = [];
  words.forEach((token, tokenIndex) => {
    const item = {
      "index": tokenIndex + 1,
      "word": removeDiacritics(token),
    };
    located.push(item);

    // split() yields "" when the text starts or ends with a separator;
    // an empty token has no position to report.
    if (token === "") {
      return;
    }

    // Explicit has() check: a recorded end offset of 0 is a real value and
    // must not be mistaken for "never seen".
    const from = lastEnds.has(token) ? lastEnds.get(token) + 1 : 0;
    let start = source.indexOf(token, from);
    while (start !== -1) {
      const end = start + token.length - 1;
      // Accept only whole-word hits: no token character on either side.
      if (!isWordCharAt(source, start - 1) && !isWordCharAt(source, end + 1)) {
        item.characterOffsetBegin = start;
        item.characterOffsetEnd = end;
        lastEnds.set(token, end);
        break;
      }
      start = source.indexOf(token, start + 1);
    }
  });
  return located;
}

const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male";
var tokens = text.split(TOKEN_SEPARATOR);
// Diacritics-free copy of the text; kept for display purposes only, the
// offsets below are computed against `text` itself.
var normalizedText = removeDiacritics(text);

const items = locateTokens(text, tokens, seen);
tokens.forEach((token, position) => {
  const item = items[position];
  console.log(token, "---->", item.characterOffsetBegin, item.characterOffsetEnd);
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.11/lodash.min.js"></script>