我正在尝试创建一个Javascript函数,该函数可以在UTF-8字符串中找到一组模式的所有位置。例如:
我有一个字符串“detaj”(这是一个用国际音标符号写的转录,所以我需要一个完整的UTF-8支持)。
我有一系列模式:["(?!dʒ)d", "(?!tʃ)t"]
(每个字符串也是UTF-8编码的)。
我需要找到每个模式的位置并获得以下数组:
[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]
0 - 是符号“d”的位置,2 - 是符号“t”的位置。
我一开始使用的是这个函数: https://stackoverflow.com/a/3410557/2006215
// Record the start offset of every case-insensitive occurrence of "le" in str.
var str = "I learned to play the Ukulele in Lebanon.";
var regex = /le/gi;
var indices = [];
// With the "g" flag each exec() call resumes from regex.lastIndex, so calling
// it repeatedly walks through the matches until it returns null.
var result = regex.exec(str);
while (result) {
    indices.push(result.index);
    result = regex.exec(str);
}
我把它改成了下面这样:
/**
 * Finds every position at which any of the given regex patterns matches
 * inside a transcription.
 *
 * @param {string[]} sounds_regex_array - Regex source strings, e.g. "(?!dʒ)d".
 * @param {string} word_transcription - Text to search.
 * @returns {Array<[number, string]>} One [index, pattern] pair per match,
 *     grouped by pattern in input order. Each index is the offset reported by
 *     RegExp.prototype.exec, i.e. counted in UTF-16 code units.
 * @throws {SyntaxError} If a pattern is not a valid regular expression.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
    var allIndices = [];
    var text = String(word_transcription);
    for (var i = 0; i < sounds_regex_array.length; i++) {
        var currentSoundRegex = sounds_regex_array[i];
        // The pattern is compiled as-is. JavaScript strings are already
        // Unicode (UTF-16), so non-ASCII characters such as "ʒ" match
        // directly. Re-encoding the pattern to UTF-8 bytes would replace each
        // of them with several byte-valued characters that never occur in
        // the subject string.
        var pattern = new RegExp(currentSoundRegex, "g");
        var result;
        while ((result = pattern.exec(text)) !== null) {
            allIndices.push([result.index, currentSoundRegex]);
            // A zero-length match leaves lastIndex where it was; step past it
            // so the loop always terminates.
            if (result[0] === "") {
                pattern.lastIndex++;
            }
        }
    }
    return allIndices;
}
/**
 * Collects the start index of every match of `regex` in `str`.
 *
 * The regex is driven through repeated exec() calls, so its `lastIndex` is
 * read and updated while scanning. A regex without the "g" (or "y") flag
 * cannot advance, so at most its first match is reported instead of looping
 * forever on it.
 *
 * @param {RegExp} regex - Pattern to search for; normally created with "g".
 * @param {string} str - Text to search.
 * @returns {number[]} Match offsets in ascending order (UTF-16 code units).
 */
function getIndicesOfRegex (regex, str) {
    var text = String(str);
    var indices = [];
    // Only global/sticky regexes move lastIndex forward between exec() calls.
    var advances = regex.global || regex.sticky;
    var result;
    while ((result = regex.exec(text)) !== null) {
        indices.push(result.index);
        if (!advances) {
            break;
        }
        if (result[0] === "") {
            // An empty match does not move lastIndex; step over one character
            // manually. In Unicode mode a surrogate pair counts as a single
            // character, so skip both halves.
            var next = result.index + 1;
            if ((regex.unicode || regex.unicodeSets) && next < text.length) {
                var lead = text.charCodeAt(result.index);
                var trail = text.charCodeAt(next);
                if (lead >= 0xD800 && lead <= 0xDBFF && trail >= 0xDC00 && trail <= 0xDFFF) {
                    next++;
                }
            }
            regex.lastIndex = next;
        }
    }
    return indices;
}
有人有什么想法吗?
更新:我从 JSON 文件中读取转录和正则表达式模式,该文件是我用 PHP 根据 UTF-8 字符串生成的。我不知道这种(\uXXXX 转义)格式叫什么,但它不是 UTF-8。无论如何,它在我的 Javascript 函数里都不起作用。
// Sample question list in the shape delivered by the JSON file. Non-ASCII IPA
// characters are written as \uXXXX escapes, which JavaScript decodes to the
// same characters as the literal glyphs would produce.
var questions = [
    {
        "word": "sorte",
        "word_transcription": "s\u0254\u0281t",
        "sounds_array": ["d", "t"],
        "sounds_regex_array": ["(?!d\u0292)d", "(?!t\u0283)t"]
    }
];
答案 0 :(得分:0)
我发现了问题所在。
错误被触发,因为我试图在Javascript中执行lookbehind,这是不受支持的。
此处提出了自定义lookbehind函数的解决方法 - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
但最后我只是对代码进行了自己的修改。上面的函数需要XRegExp库,这非常重。
我的解决方案:
/**
 * Collects the start index of every match of `pattern` in `str`, emulating a
 * negative lookbehind for two sounds:
 *   - "ʒ" is ignored when the character right before it is "d";
 *   - "ʃ" is ignored when the character right before it is "t".
 *
 * The regex is driven through repeated exec() calls, so its `lastIndex` is
 * read and updated while scanning. A regex without the "g" (or "y") flag
 * cannot advance, so only its first match is examined instead of looping
 * forever on it.
 *
 * @param {string} currentSoundRegex - Pattern source with the lookbehind
 *     already stripped; selects which lookbehind rule applies, if any.
 * @param {RegExp} pattern - Compiled regex; normally created with "g".
 * @param {string} str - Text to search.
 * @returns {number[]} Accepted match offsets in ascending order.
 */
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
    var text = String(str);
    var indices = [];

    // Character that must NOT directly precede a match, or null for no rule.
    var forbiddenPrevious = null;
    if (currentSoundRegex === "ʒ") {
        forbiddenPrevious = "d";
    } else if (currentSoundRegex === "ʃ") {
        forbiddenPrevious = "t";
    }

    // Only global/sticky regexes move lastIndex forward between exec() calls.
    var advances = pattern.global || pattern.sticky;
    var result;
    while ((result = pattern.exec(text)) !== null) {
        var at = result.index;

        if (advances && result[0] === "") {
            // An empty match does not move lastIndex; step over one character
            // manually. In Unicode mode a surrogate pair counts as a single
            // character, so skip both halves.
            var next = at + 1;
            if ((pattern.unicode || pattern.unicodeSets) && next < text.length) {
                var lead = text.charCodeAt(at);
                var trail = text.charCodeAt(next);
                if (lead >= 0xD800 && lead <= 0xDBFF && trail >= 0xDC00 && trail <= 0xDFFF) {
                    next++;
                }
            }
            pattern.lastIndex = next;
        }

        var rejectedByLookbehind = forbiddenPrevious !== null &&
            at > 0 &&
            text.charAt(at - 1) === forbiddenPrevious;
        if (!rejectedByLookbehind) {
            indices.push(at);
        }

        if (!advances) {
            break;
        }
    }
    return indices;
}
/**
 * Pairs every match position found in a transcription with the sound it
 * belongs to.
 *
 * @param {string[]} sounds_array - Sound labels, parallel to sounds_regex_array.
 * @param {string[]} sounds_regex_array - Regex source string for each sound.
 * @param {string} word_transcription - Text to search.
 * @returns {Array<[number, string]>} One [index, sound] pair per match,
 *     grouped by sound in input order.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
    var found = [];
    for (var k = 0; k < sounds_regex_array.length; k++) {
        var source = sounds_regex_array[k];
        // The two lookbehind patterns are reduced to the bare sound here and
        // never compiled; getIndicesOfRegex receives the bare sound and
        // applies the "not preceded by" check itself.
        // Background: http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
        switch (source) {
            case "(?<!d)ʒ":
                source = "ʒ";
                break;
            case "(?<!t)ʃ":
                source = "ʃ";
                break;
        }
        var compiled = new RegExp(source, "g");
        var hits = getIndicesOfRegex(source, compiled, word_transcription);
        var sound = sounds_array[k];
        var h = 0;
        while (h < hits.length) {
            found.push([hits[h], sound]);
            h++;
        }
    }
    return found;
}