Question

对于给定的ASCII或Unicode文本，我需要按单词边界替换其中的单词标记（token）。我的方法是使用同时考虑Unicode和非Unicode两种情况的正则表达式来查找标记（请参见here），然后循环调用RegExp.exec逐个匹配标记的出现位置，并记录每个单词上一次被匹配到的位置：

/**
 * Replace word tokens in a text, consuming one occurrence of a word per token.
 *
 * `tokens[i]` is paired with `words[i]`; `expr[tokens[i]]` is the replacement
 * template for that word, in which every literal `$1` is substituted with the
 * matched word. Each token replaces the first occurrence of its word that has
 * not already been replaced, so a word listed twice replaces two successive
 * occurrences instead of the same one twice.
 *
 * All matching is done against the original `text`; replacements are spliced
 * in afterwards. Inserted markup is therefore never re-scanned, and a word
 * that also appears inside a template (e.g. "span") cannot corrupt earlier
 * replacements.
 *
 * Pairs with a missing/empty word or a missing template are skipped.
 *
 * @param {boolean} isUnicode true to delimit words by whitespace
 *   (`(?<!\S)word(?!\S)`), false to use ASCII word boundaries (`\bword\b`)
 * @param {string} text input text
 * @param {string[]} tokens keys into `expr`, one per entry of `words`
 * @param {string[]} words words to replace, `words[i]` belongs to `tokens[i]`
 * @param {Object<string, string>} expr replacement templates keyed by token
 * @returns {string} the text with the matched words replaced
 */
function replaceTokens(isUnicode, text, tokens, words, expr) {
    // Per word: offset in the ORIGINAL text where the next search starts.
    const nextSearchFrom = new Map();
    // Ranges of the original text that are already claimed by a replacement.
    const edits = [];

    const overlapsClaimed = (start, end) =>
        edits.some((edit) => start < edit.end && edit.start < end);

    tokens.forEach((token, index) => {
        const word = words[index];
        const template = expr[token];
        // An empty word would produce zero-length matches (and can loop
        // forever); a missing word/template has nothing sensible to insert.
        if (typeof word !== "string" || word.length === 0 || typeof template !== "string") {
            return;
        }

        const escaped = word.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
        // Built by concatenation so that `$` sequences in the word are never
        // interpreted as replacement patterns.
        const source = isUnicode
            ? "(?<!\\S)" + escaped + "(?!\\S)"
            : "\\b(" + escaped + ")\\b";
        const wordRegex = new RegExp(source, "g");
        wordRegex.lastIndex = nextSearchFrom.has(word) ? nextSearchFrom.get(word) : 0;

        let match;
        while ((match = wordRegex.exec(text)) !== null) {
            const start = match.index;
            const end = wordRegex.lastIndex;
            // Skip occurrences lying inside a range another word already took.
            if (!overlapsClaimed(start, end)) {
                edits.push({
                    start: start,
                    end: end,
                    // Insert the matched text itself, not its regex-escaped form.
                    value: template.split("$1").join(match[0]),
                });
                nextSearchFrom.set(word, end);
                break;
            }
        }
    });

    if (edits.length === 0) {
        return text;
    }

    // Splice all replacements into the original text in a single pass.
    edits.sort((a, b) => a.start - b.start);
    let result = "";
    let cursor = 0;
    for (const edit of edits) {
        result += text.slice(cursor, edit.start) + edit.value;
        cursor = edit.end;
    }
    return result + text.slice(cursor);
}

因此，要使用它，我需要源文本text、单词数组words、单词索引数组tokens，以及从索引到替换模板的映射expr：

// Example: ASCII text; tokens[i] selects the template in `expr` used for words[i].
let text = "Hello Word";
let words = ["Hello", "Word"];
let tokens = ["1", "2"];
// Single-quoted so the double quotes inside the HTML attributes need no escaping.
let expr = { "1": '<span style="color:blue;">$1</span>', "2": '<span style="color:red;">$1</span>' };
replaceTokens(false, text, tokens, words, expr);

或在Unicode情况下：

// Example: Unicode text with a repeated word ("कहते" occurs twice).
text = "तुम मुझे दोस्त कहते कहते हो";
words = ["तुम", "मुझे", "दोस्त", "कहते", "कहते", "हो"];
// One token per word, in the same order; each token is a key into `expr`.
tokens = ["1", "2", "1", "2", "1", "2"];
expr = { "1": '<span style="color:blue;">$1</span>', "2": '<span style="color:red;">$1</span>' };
replaceTokens(true, text, tokens, words, expr);

由于存在嵌套操作，我不确定这是否是最佳解决方案。我尝试过使用String.replace并指定一个替换函数（请参见here），以代替嵌套的RegExp.exec，但是我不知道该怎么做。在一个更简单的解决方案中，我会这样写：

// Replaces every whole-word, case-insensitive occurrence of `word` in one call.
const everyOccurrence = new RegExp(`\\b(${word})\\b`, "gi");
text = text.replace(everyOccurrence, expr[token]);

但是这种方法不会考虑单词在多个位置的重复（它会替换所有单词），因此即使计算效率更高，它也无法正常工作。

如何在JavaScript中按单词边界替换文本中的标记？

0 个答案: