JavaScript正则表达式对UTF-8的支持

时间:2017-05-22 22:08:03

标签: javascript regex utf-8

我正在尝试创建一个Javascript函数,该函数可以在UTF-8字符串中找到一组模式的所有位置。例如:

我有一个字符串“detaj”(这是用国际音标符号写的转录,所以我需要完整的UTF-8支持)。

我有一系列模式:["(?!dʒ)d", "(?!tʃ)t"](每个字符串也是UTF-8编码的)。

我需要找到每个模式的位置并获得以下数组:

[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]

0 - 是符号“d”的位置,2 - 是符号“t”的位置。

我是从下面这个函数开始的: https://stackoverflow.com/a/3410557/2006215

// Collect the start offset of every case-insensitive "le" in the sample sentence.
var str = "I learned to play the Ukulele in Lebanon.";
var regex = /le/gi;
var result;
var indices = [];
// With the "g" flag every exec() call resumes at regex.lastIndex, so the loop
// visits the matches left to right and ends once exec() returns null.
for (result = regex.exec(str); result; result = regex.exec(str)) {
    indices.push(result.index);
}

我把它改成了这样的东西:

/**
 * Finds every position at which each pattern of a list matches inside a string.
 *
 * @param {string[]} sounds_regex_array - Regular-expression sources, e.g. "(?!dʒ)d".
 * @param {string} word_transcription - Text to search; may contain IPA symbols.
 * @returns {Array} Pairs [index, patternSource], grouped by pattern in the order
 *     of sounds_regex_array. Each index is whatever getIndicesOfRegex reports
 *     for that pattern (match start offsets in UTF-16 code units).
 * @throws {SyntaxError} If a pattern source is not a valid regular expression
 *     for the running engine (thrown by the RegExp constructor).
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
    var allIndices = [];
    for (var i = 0; i < sounds_regex_array.length; i++) {
        var currentSoundRegex = sounds_regex_array[i];
        // The pattern source is passed to RegExp unchanged. JavaScript strings
        // are sequences of UTF-16 code units and the regex engine compares
        // them directly, so symbols such as "ʒ" need no re-encoding. Running
        // the source through utf8.encode() (or a hex encoder) rewrites every
        // non-ASCII symbol as several byte-valued characters, which can never
        // match the unencoded transcription.
        var pattern = new RegExp(currentSoundRegex, "g");
        var indices = getIndicesOfRegex (pattern, word_transcription);
        for (var j = 0; j < indices.length; j++) {
            allIndices.push([indices[j], currentSoundRegex ]);
        }
    }
    return allIndices;
}

/**
 * Collects the start index of every match of a regular expression in a string.
 *
 * Matching starts at the regex's current lastIndex (for "g"/"y" regexes) and
 * the regex's lastIndex is modified while scanning.
 *
 * @param {RegExp} regex - Expression to run. With the "g" or "y" flag all
 *     successive matches are reported; without them only the first match is.
 * @param {string} str - Text to search.
 * @returns {number[]} Match start offsets (UTF-16 code units), ascending.
 */
function getIndicesOfRegex (regex, str) {
    var text = String(str);
    var indices = [];
    // Only "g" and "y" regexes move lastIndex forward after a match. Any other
    // regex would return the same first match on every exec() call.
    var advances = regex.global || regex.sticky;
    var fullUnicode = regex.unicode || regex.unicodeSets;
    var result;
    while ((result = regex.exec(text)) !== null) {
        indices.push(result.index);
        if (!advances) {
            break;
        }
        if (result[0] === "") {
            // An empty match leaves lastIndex where it was, so step past it
            // manually to avoid looping forever. Under "u"/"v" step over a
            // whole surrogate pair so the next attempt does not start in the
            // middle of a code point.
            var step = 1;
            if (fullUnicode) {
                var lead = text.charCodeAt(regex.lastIndex);
                var trail = text.charCodeAt(regex.lastIndex + 1);
                if (lead >= 0xD800 && lead <= 0xDBFF && trail >= 0xDC00 && trail <= 0xDFFF) {
                    step = 2;
                }
            }
            regex.lastIndex += step;
        }
    }
    return indices;
}

有人有什么想法吗?

更新:转录和正则表达式模式来自一个json文件,该文件是我用PHP根据UTF-8字符串生成的。我不知道这种表示方式叫什么,但它不是UTF-8。无论如何,它在我的Javascript函数中不起作用。

// Sample quiz entry in JSON style: the non-ASCII phonetic symbols are written
// as \uXXXX escapes, which JavaScript decodes when it parses the literal.
var questions = [
    {
        "word": "sorte",
        "word_transcription": "s\u0254\u0281t",
        "sounds_array": ["d", "t"],
        "sounds_regex_array": ["(?!d\u0292)d", "(?!t\u0283)t"]
    }
];

1 个答案:

答案 0 :(得分:0)

我发现了问题所在。

错误被触发,因为我试图在Javascript中执行lookbehind,这是不受支持的。

此处提出了自定义lookbehind函数的解决方法 - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind

但最后我只是自己对代码做了修改,因为上面链接中的函数需要XRegExp库,而这个库非常庞大。

我的解决方案:

/**
 * Collects the start index of every match of `pattern` in `str`, emulating a
 * negative lookbehind for two specific sounds.
 *
 * When currentSoundRegex is exactly "ʒ", matches directly preceded by "d" are
 * dropped; when it is exactly "ʃ", matches directly preceded by "t" are
 * dropped. Any other value applies no extra filtering.
 *
 * Matching starts at the pattern's current lastIndex (for "g"/"y" regexes)
 * and the pattern's lastIndex is modified while scanning.
 *
 * @param {string} currentSoundRegex - Pattern source that selects the filter.
 * @param {RegExp} pattern - Compiled expression. With the "g" or "y" flag all
 *     successive matches are examined; without them only the first match is.
 * @param {string} str - Text to search.
 * @returns {number[]} Start offsets (UTF-16 code units) of the kept matches.
 */
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
    var text = String(str);
    var indices = [];
    // Only "g" and "y" regexes move lastIndex forward after a match. Any other
    // regex would return the same first match on every exec() call.
    var advances = pattern.global || pattern.sticky;
    var fullUnicode = pattern.unicode || pattern.unicodeSets;
    var result;
    while ((result = pattern.exec(text)) !== null) {
        if (advances && result[0] === "") {
            // An empty match leaves lastIndex where it was, so step past it
            // manually to avoid looping forever. Under "u"/"v" step over a
            // whole surrogate pair.
            var step = 1;
            if (fullUnicode) {
                var lead = text.charCodeAt(pattern.lastIndex);
                var trail = text.charCodeAt(pattern.lastIndex + 1);
                if (lead >= 0xD800 && lead <= 0xDBFF && trail >= 0xDC00 && trail <= 0xDFFF) {
                    step = 2;
                }
            }
            pattern.lastIndex += step;
        }

        // Hand-written stand-in for the lookbehinds (?<!d)ʒ and (?<!t)ʃ: the
        // match is rejected when it is the second half of a "dʒ" / "tʃ" pair.
        var previousChar = result.index > 0 ? text.charAt(result.index - 1) : "";
        var isExcluded =
            (currentSoundRegex === "ʒ" && previousChar === "d") ||
            (currentSoundRegex === "ʃ" && previousChar === "t");
        if (!isExcluded) {
            indices.push(result.index);
        }

        if (!advances) {
            break;
        }
    }
    return indices;
}

/**
 * For each pattern in sounds_regex_array, finds its match positions in
 * word_transcription and labels them with the sound at the same array index.
 *
 * The two lookbehind sources "(?<!d)ʒ" and "(?<!t)ʃ" are replaced by the bare
 * symbol before compiling; the pattern source is also handed to
 * getIndicesOfRegex, which receives it as its first argument.
 *
 * @param {string[]} sounds_array - Labels, parallel to sounds_regex_array.
 * @param {string[]} sounds_regex_array - Regular-expression sources.
 * @param {string} word_transcription - Text to search.
 * @returns {Array} Pairs [index, sound], grouped by pattern in input order.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
    var collected = [];
    var patternNo, hit;
    for (patternNo = 0; patternNo < sounds_regex_array.length; patternNo += 1) {
        var source = sounds_regex_array[patternNo];
        // Strip the lookbehind from the two known sources so the RegExp
        // constructor only ever sees the bare symbol.
        // Workaround reference: http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
        switch (source) {
            case "(?<!d)ʒ":
                source = "ʒ";
                break;
            case "(?<!t)ʃ":
                source = "ʃ";
                break;
        }
        var compiled = new RegExp(source, "g");

        var positions = getIndicesOfRegex (source, compiled, word_transcription);
        var label = sounds_array[patternNo];
        for (hit = 0; hit < positions.length; hit += 1) {
            collected.push([positions[hit], label]);
        }
    }
    return collected;
}