如何在JavaScript正则表达式的匹配结果中找到各个分组的索引?

时间:2009-12-31 14:21:26

标签: javascript regex

当我写一个正则表达式时:

// A non-global String#match returns the same array RegExp#exec would:
// the whole match, each captured group, plus the `index` and `input` properties.
var m = "this is hello to you".match(/(s+).*?(l)[^l]*?(o+)/);
console.log(m);

我得到一个包含以下内容的匹配对象:

{
  0: "s is hello",
  1: "s",
  2: "l",
  3: "o",
  index: 3,
  input: "this is hello to you"
}

通过index属性我可以知道整个匹配的起始索引,但我还需要知道每个匹配分组的开始和结束位置。简单的字符串搜索无法解决这个问题:在这个例子中,它会找到第一个'l',而不是分组中实际捕获到的那个。

有没有办法获得匹配组的偏移?

6 个答案:

答案 0 :(得分:15)

您无法直接获取匹配分组的索引。您需要做的是先把每个字符都放进某个捕获分组中,即使是您并不关心的字符:

// Every character of the match is covered by a capturing group,
// including the stretches we are not interested in.
var m = 'this is hello to you'.match(/(s+)(.*?)(l)([^l]*?)(o+)/);

现在整个匹配被完整地拆分成了各个分组:

['s is hello', 's', ' is hel', 'l', '', 'o']

因此,只要把目标分组之前所有分组的字符串长度累加起来,就能得到从匹配索引到该分组索引的偏移量:

/**
 * Computes the index in the input string at which capture group `n` starts.
 *
 * The result is only meaningful when the pattern wraps every character of the
 * match in consecutive, non-nested capture groups, because the offset is the
 * sum of the lengths of groups 1..n-1 added to the index of the whole match.
 *
 * @param {RegExpExecArray} match - Result of RegExp#exec (its `index` and groups are read).
 * @param {number} n - 1-based number of the capture group to locate.
 * @returns {number} Index of group `n` within the original input.
 */
function indexOfGroup(match, n) {
    let ix = match.index;
    for (let i = 1; i < n; i++) {
        // A group that did not participate (e.g. an optional one) is `undefined`;
        // it consumed no characters, so it adds nothing to the offset.
        ix += (match[i] || '').length;
    }
    return ix;
}

console.log(indexOfGroup(m, 3)); // 11 -- index in the input of the 'l' captured by group 3

答案 1 :(得分:9)

我写了一个简单的(初始化有点臃肿)javascript对象来解决我最近一直在研究的项目上的这个问题。它的工作方式与接受的答案相同,但会生成新的正则表达式并自动提取您请求的数据。

// MultiRegExp is the answer author's helper class; it is not defined in this file.
var exp = new MultiRegExp(/(firstBit\w+)this text is ignored(optionalBit)?/i);
var value = exp.exec("firstbitWithMorethis text is ignored");

// Illustration of the result the author reports: one entry per capture group,
// holding the group's index and text, or null for a group that did not match.
value = {0: {index: 0, text: 'firstbitWithMore'},
         1: null};

Git Repo:My MultiRegExp。希望这可以帮助那些人。

编辑2015年8月:

试试我:MultiRegExp Live

答案 2 :(得分:2)

另一个能够解析嵌套组的javascript类位于:https://github.com/valorize/MultiRegExp2

用法:

// Pattern with one non-capturing group followed by two nested capturing groups.
let regex = /a(?: )bc(def(ghi)xyz)/g;
// MultiRegExp2 comes from the linked repository; it is not defined in this file.
let regex2 = new MultiRegExp2(regex);

// Fixed: the original call had an extra closing parenthesis (syntax error).
let matches = regex2.execForAllGroups('ababa bcdefghixyzXXXX');

Will output:
[ { match: 'defghixyz', start: 8, end: 17 },
  { match: 'ghi', start: 11, end: 14 } ]

答案 3 :(得分:1)

我一直在尝试添加嵌套的捕获组和带有位置信息的命名组。 您可以在jsfiddle上使用一些正则表达式... https://jsfiddle.net/smuchow1962/z5dj9gL0/

/*
Copyright (c) 2019 Steven A Muchow
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Enhanced RegEx JS processing
Adds position information for capture groups (nested ones too) AND named group items.
*/
/**
 * Runs a regular expression and reports start/end positions for every capture
 * group (nested ones included) together with the names of named groups.
 *
 * How it works:
 *  1. The regex *source* is scanned to list its capturing groups and, for each,
 *     the nearest enclosing capturing group (its "parent").
 *  2. Each exec() result is then decorated: a group's position is found by
 *     searching for its captured text inside its parent's captured text.
 *
 * Known limitations that follow from this approach:
 *  - Positions come from indexOf/lastIndexOf, so when the captured text occurs
 *    more than once inside the parent the reported position can be the wrong
 *    occurrence.
 *  - The source scanner does not understand character classes (`[(]`) and
 *    treats a lookaround as a single token ending at the first `)`.
 */
class RegexContainer {

    /**
     * Lists the capturing groups found in a regex source string.
     *
     * @param {RegExp} re - Global tokenizer that matches group openers, group
     *     closers and `{n,m}` quantifiers; a named opener must expose the name
     *     through the `name` named group.
     * @param {string} input - The regex source to scan.
     * @returns {Array<Object>} Entry 0 describes the whole pattern
     *     (`{start, end, source}`); entry i (i >= 1) describes capturing group i
     *     as `{parent, refCount, start, name?, end, source}` where `parent` is
     *     the entry index of the enclosing capturing group (0 when top level)
     *     and `refCount` is the capture nesting depth.
     */
    static _findCaptureGroupsInRegexTemplate(re, input) {
        const NON_CAPTURING = -1;
        const matches = [];
        // Stack of currently open groups: an index into `matches` for a
        // capturing group, NON_CAPTURING for a `(?:` group.
        const openGroups = [];
        let refCount = 0;
        let res;
        re.lastIndex = 0;
        while ((res = re.exec(input)) !== null) {
            const token = res[0];
            if (token === '(' && input.startsWith('?:', re.lastIndex)) {
                // `(?:` opens a group but creates no capture slot; it is only
                // tracked so that its `)` is not mistaken for a capture's end.
                openGroups.push(NON_CAPTURING);
            } else if (isCapturingStartItem(token)) {
                refCount++;
                // +1 because the whole-pattern entry is unshifted in front later.
                const data = {parent: enclosingCaptureIndex() + 1, refCount: refCount, start: res.index};
                if (res.groups && res.groups.name) { data.name = res.groups.name; }
                openGroups.push(matches.length);
                matches.push(data);
            } else if (input.charAt(res.index) === ')' && openGroups.length > 0) {
                // A `)` with nothing open (e.g. inside `[)]`) is ignored.
                const closedIdx = openGroups.pop();
                if (closedIdx !== NON_CAPTURING) {
                    const closed = matches[closedIdx];
                    // lastIndex sits after any quantifier that follows the `)`.
                    closed.end = re.lastIndex;
                    closed.source = input.substring(closed.start, closed.end);
                    refCount--;
                }
            }
        }
        matches.unshift({start: 0, end: input.length, source: input});
        return matches;

        // True for a plain `(` and for a named-group opener `(?<name`.
        function isCapturingStartItem(str) {
            if (str !== '(') { return (str.search(/\(\?<\w/)!==-1); }
            return true;
        }

        // Index in `matches` of the innermost open capturing group, or -1.
        function enclosingCaptureIndex() {
            for (let i = openGroups.length - 1; i >= 0; i--) {
                if (openGroups[i] !== NON_CAPTURING) { return openGroups[i]; }
            }
            return -1;
        }
    }

    /**
     * Executes `re` against `input` and decorates every match with positions.
     *
     * @param {RegExp} re - Expression to run. With the `g` or `y` flag every
     *     match is collected; otherwise only the first one.
     * @param {string} input - Text to search.
     * @param {Array<Object>} foundCaptureItems - Group descriptors as returned
     *     by mapCaptureAndNameGroups() for `re.source`.
     * @returns {Array<Array<Object>>} One array per match. Element 0 is the
     *     whole match `{data, startPos, endPos}`; element i is group i as
     *     `{data, parent, startPos, endPos, groupName?}`. A group that did not
     *     participate has `data === ''`.
     */
    static execFull(re, input, foundCaptureItems) {
        const matches = [];
        let result;
        while ((result = re.exec(input)) !== null) {
            const array = createCustomResultArray(result);
            array.forEach((match, idx) => {
                if (idx === 0) {
                    match.startPos = result.index;
                    match.endPos = result.index + result[0].length;
                    delete match.parent;
                    return;
                }
                const parent = array[match.parent];
                // First child of its parent: search from the left; later
                // siblings: search from the right.
                const foundIdx = (match.parent < idx - 1)
                    ? parent.data.lastIndexOf(match.data)
                    : parent.data.indexOf(match.data);
                match.startPos = foundIdx + parent.startPos;
                match.endPos = match.startPos + match.data.length;
                const groupName = foundCaptureItems[idx].name;
                if (groupName) { match.groupName = groupName; }
            });
            matches.push(array);
            // Without g/y, exec() never advances and would return the same match forever.
            if (!re.global && !re.sticky) { break; }
            // A zero-length match leaves lastIndex where it was; step over it
            // manually or the loop never terminates.
            if (result[0] === '') { advancePastEmptyMatch(); }
        }
        return matches;

        // Wraps each exec() entry together with the parent index of its group.
        function createCustomResultArray(result) {
            return Array.from(result, (data, i) => {
                return {data: data || '', parent: foundCaptureItems[i].parent};
            });
        }

        // Moves lastIndex forward by one character (a whole code point in unicode mode).
        function advancePastEmptyMatch() {
            const isUnicode = re.unicode || re.unicodeSets;
            const codePoint = isUnicode ? input.codePointAt(re.lastIndex) : undefined;
            re.lastIndex += (codePoint !== undefined && codePoint > 0xFFFF) ? 2 : 1;
        }
    }

    /**
     * Builds the capture-group descriptors for a regex source string.
     *
     * @param {string} inputRegexSourceString - `source` of the regex to analyse.
     * @returns {Array<Object>} See _findCaptureGroupsInRegexTemplate().
     */
    static mapCaptureAndNameGroups(inputRegexSourceString) {
        // Tokenizer: unescaped `(` (optionally a named-group or lookaround
        // opener), unescaped `)` with an optional quantifier, or a `{n,m}` quantifier.
        const REGEX_CAPTURE_GROUPS_ANALYZER = /((((?<!\\)|^)\((\?((<(?<name>\w+)))|(\?<=.*?\))|(\?<!.*?\))|(\?!.*?\))|(\?=.*?\)))?)|((?<!\\)\)(([*+?](\?)?))?|({\d+(,)?(\d+)?})))/gm;
        return RegexContainer._findCaptureGroupsInRegexTemplate(REGEX_CAPTURE_GROUPS_ANALYZER, inputRegexSourceString);
    }

    /**
     * Analyses `re`, runs it on `input` and returns both pieces of information.
     *
     * @param {RegExp} re - Expression to analyse and execute.
     * @param {string} input - Text to search.
     * @returns {{captureItems: Array<Object>, results: Array<Array<Object>>}}
     *     `captureItems` from mapCaptureAndNameGroups(), `results` from execFull().
     */
    static exec(re, input) {
        const foundCaptureItems = RegexContainer.mapCaptureAndNameGroups(re.source);
        const res = RegexContainer.execFull(re, input, foundCaptureItems);
        return {captureItems: foundCaptureItems, results: res};
    }

}

// Results gathered for each sample pattern: {captureItems, results, label}.
let answers = [];
// Sample patterns (as source strings) exercising nested and named groups.
let regex = [
    { re: "[ \\t]*?\\[\\[(?<inner>\\s*(?<core>\\w(.|\\s)*?)\\s*?)]]", label: "NESTED Regex"},
    { re: "(?<context>((\\w)(\\w|-)*))((?<separator>( - ))?(?<type>(-|\\w)+)?\\s*(?<opt>(\\{.*}))?)?[\\t ]*", label: "simpler regex" },
];

let input = "[[ context1 ]]  [[ context2 - with-style { andOpts : {data: 'some info'} } ]]";

// Compile each pattern with the g and m flags and run it over the sample input.
for (const item of regex) {
    const compiled = new RegExp(item.re, 'gm');
    const outcome = RegexContainer.exec(compiled, input);
    outcome.label = item.label;
    answers.push(outcome);
}

// Report: for every match, print each named group using its computed positions.
for (const [index, answer] of answers.entries()) {
    console.log('==========================================================');
    console.log(`==== Item ${index} label: ${answer.label} regex: ${answer.captureItems[0].source}`);
    console.log('==========================================================\n\n');
    for (const match of answer.results) {
        const whole = match[0];
        const expected = whole.data;
        // Sanity check: the computed span must reproduce the matched text.
        if (expected !== input.substring(whole.startPos, whole.endPos)) {
            console.log('error in the parsing if you get here');
            continue;
        }
        console.log(`==== Checking ${expected}`);
        const fence = "```";
        for (const capture of match.slice(1)) {
            if (!capture.groupName) { continue; }
            const captured = input.substring(capture.startPos, capture.endPos);
            console.log(` ${capture.groupName}: ${fence}${captured}${fence}`);
        }
        console.log('');
    }
}

建筑

  • 使用Regex模板并标识它将生成的捕获组。将其另存为一组组项目和嵌套信息,以馈入扩展的exec()调用。
    • 使用正则表达式查找捕获起点,非捕获元素,捕获名称和捕获结尾。正确捕获可怕的\(和\)项。
    • 对捕获物及其父母进行非递归检查(使用引用计数)。
  • 运行exec()并将捕获组信息拉到上方。
    • 使用子字符串函数为每个捕获组提取数据
    • 将找到的每个结果都放入数组中,然后将其发送回去。

答案 4 :(得分:0)

基于ecma regular expression syntax,我编写了一个解析器,它是RegExp类的扩展,解决了这个问题(提供带完整索引的exec方法)以及JavaScript RegExp实现的其他限制,例如基于分组的搜索和替换。您可以test and download the implementation here(也可以作为NPM模块使用)。

实现如下(小例子):

// Retrieve content and position of the opening tag, the body and the closing tag
// of non-nested HTML tags. `Regex` is the answer author's class (not defined in this file).
var pattern = '(<([^ >]+)[^>]*>)([^<]*)(<\\/\\2>)';
var str = '<html><code class="html plain">first</code><div class="content">second</div></html>';
var regex = new Regex(pattern, 'g');
var result = regex.exec(str);

// Captured text: whole match plus groups 1-4.
console.log(result.length === 5);
console.log(result[0] === '<code class="html plain">first</code>');
console.log(result[1] === '<code class="html plain">');
console.log(result[3] === 'first');
console.log(result[4] === '</code>');
// Positions: result.index is an array holding one index per entry.
console.log(result.index.length === 5);
console.log(result.index[0] === 6);
console.log(result.index[1] === 6);
console.log(result.index[3] === 31);
console.log(result.index[4] === 36);

我尝试了@velop的实现,但该实现似乎有问题,例如它没有正确处理反向引用,如“/a(?: )bc(def(\1ghi)xyz)/g”——当在前面添加括号时,反向引用\1需要相应递增(在他的实现中并非如此)。

答案 5 :(得分:0)

对于全局正则表达式,您通常只想匹配若干片段并逐个迭代,因此第一个解决方案无法正常工作。下面是一个大约花了30分钟写出的解决方案,它基于indexOf和偏移量累加,适用于这种情况:

https://codepen.io/cancerberoSgx/pen/qYwjjz?editors=0012#code-area

!function () {
  const regex = /\/\*\*\*@\s*([^@]+)\s*(@\*\*\*\/)/gim;
  const exampleThatMatch = `
    /***@
    debug.print('hello editor, simpleNode kind is ' +
    arg.simpleNode.getKindName())
    @***/

    const a = 1 //user

    /***@
    debug.print(arg.simpleNode.getParent().getKindName())
    @***/
    `;
  const text = exampleThatMatch;

  /**
   * Runs `r` over `s` and returns, for every match, the capture groups with
   * their positions in `s`.
   *
   * Each group is located with indexOf, searching from the start of the
   * previous group (or from the start of the match for group 1), so groups are
   * expected to appear in the text in pattern order.
   *
   * @param {RegExp} r - Expression to run; all matches are collected when it
   *     has the g or y flag, otherwise only the first.
   * @param {string} s - Text to search.
   * @returns {Array<Array<{value: (string|undefined), start: number, end: number}>>}
   *     One array per match with one entry per capture group; a group that did
   *     not participate is reported with start and end of -1.
   */
  function exec(r, s) {
    const matches = [];
    let result;
    while ((result = r.exec(s)) !== null) {
      const match = [];
      // Search from this match onwards: starting at 0 would pick up identical
      // text belonging to an earlier match.
      let relIndex = result.index;
      for (let i = 1; i < result.length; i++) {
        const value = result[i];
        if (value === undefined) {
          // Unmatched optional group: there is nothing to locate.
          match.push({ value, start: -1, end: -1 });
          continue;
        }
        relIndex = s.indexOf(value, relIndex);
        match.push({ value, start: relIndex, end: relIndex + value.length });
      }
      matches.push(match);
      // Without g/y, exec() would return the same match forever.
      if (!r.global && !r.sticky) { break; }
      // A zero-length match does not advance lastIndex by itself.
      if (result[0] === '') { r.lastIndex++; }
    }
    return matches;
  }

  const groupsWithIndex = exec(regex, text);
  console.log({RESULT: groupsWithIndex });
  // now test - let's remove everything else but matched groups
  let frag = '';
  const sep = '\n#######\n';
  groupsWithIndex.forEach(match => match.forEach(group => {
    frag += text.substring(group.start, group.end) + sep;
  }));
  console.log('The following are only the matched groups usign the result and text.substring just to verify it works OK:', '\n'+sep);
  console.log(frag);
}()

以防万一,这里还有TypeScript版本:

https://codepen.io/cancerberoSgx/pen/yjrXxx?editors=0012

| 享受