Posix Regex Search中捕获组不正确

时间:2015-10-04 12:23:20

标签: c regex

在C项目中,我编写了一个函数来返回正则表达式搜索中的第一个捕获组。

这个在线解析器的输出最能说明我期望实现的目标(注意右侧面板上的捕获组输出)。

我写的函数和测试代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <string.h>
#include <assert.h>

/* Minimal boolean type for this example: bool is an alias for int,
 * and true/false expand to 1/0. */
typedef int bool;
#define true 1
#define false 0

/*
 * Runs the extended regular expression reg_exp against search_str and hands
 * back a freshly allocated copy of the matched text through *output.
 *
 * The function is intended to obtain the first group of the match.
 * NOTE: only one regmatch_t slot is requested from regexec(), so the text
 * copied out is the overall match (slot 0), not a parenthesised
 * subexpression.
 *
 * *output is always written before returning:
 *  - in case the regexp compilation succeeded
 *      - the matched text in case there was a match
 *      - or NULL in case there was no match (or an allocation failed)
 *  - in case the regexp compilation failed
 *      - the error message from the compilation process
 *        (NULL if the message buffer could not be allocated)
 *
 * Returns false if reg_exp could not be compiled, true otherwise.
 *
 * REG_NOTBOL is passed to regexec(), so '^' does not match at the very
 * start of search_str.
 *
 * NOTE: The user is responsible for free-ing the memory for *output
 */
bool get_first_match(const char* search_str, const char* reg_exp, char** output)
{
    enum { ERROR_BUF_SIZE = 1024 };
    int res;
    regex_t preg;
    regmatch_t pmatch;

    // Give the caller a well-defined value on every path
    *output = NULL;

    // Compile the input regexp
    if( (res = regcomp(&preg, reg_exp, REG_EXTENDED)) != 0)
    {
        char* error = malloc(ERROR_BUF_SIZE);
        if(error != NULL)
        {
            regerror(res, &preg, error, ERROR_BUF_SIZE);
        }
        // Store through the caller's pointer; assigning to the local
        // parameter itself would never reach the caller
        *output = error;
        return false;
    }

    res = regexec(&preg, search_str, 1, &pmatch, REG_NOTBOL);
    // pmatch holds plain offsets, so the compiled pattern can be released
    // here, which covers the match and no-match paths alike
    regfree(&preg);
    if(res != 0)
    {
        // REG_NOMATCH, or any other failure: pmatch was not filled in
        return true;
    }

    size_t len = (size_t)(pmatch.rm_eo - pmatch.rm_so);
    char* result = malloc(len + 1);
    if(result != NULL)
    {
        memcpy(result, search_str + pmatch.rm_so, len);
        result[len] = '\0'; // null-terminate the result
    }
    *output = result;
    return true;
}

/*
 * Demo driver: searches a query-string style input for the "map" parameter
 * using get_first_match() and prints the match, a "no match" notice, or the
 * regex compilation error.
 */
int main(void)
{
    const char* search_str = "param1=blah&param2=blahblah&param3=blahetc&map=/usr/bin/blah.map";
    const char* regexp = "map=([^\\&]*)(&|$)";
    // Start from NULL so the checks and the free() below are well-defined
    // even if get_first_match() does not store anything
    char* output = NULL;
    bool status = get_first_match(search_str, regexp, &output);
    if(status){
       if(output)
           printf("Found match: %s\n", output);
       else
           printf("No match found.");
    }
    else{
       // Passing NULL for %s is undefined, so substitute a placeholder
       printf("Regex error: %s\n", output ? output : "(no error text available)");
    }
    free(output); // free(NULL) is a no-op

    return 0;
}

但是,我从C代码得到的输出包含字符串中的map=部分,即使我已将它明确排除在第一个捕获组之外。

我该怎么做才能获得不含map=部分的捕获组?为什么我从在线解析器得到的结果与我的C程序得到的结果不同?

1 个答案:

答案 0 :(得分:2)

这里发生的是,你有这样的模式:

const char* regexp = "map=([^\\&]*)(&|$)";

其中,结果(我们称之为数组result)将根据以下内容填充:

result = {
    "map=/usr/bin/blah.map",
    "/usr/bin/blah.map",
    ""
}

现在,因为您按以下方式调用regexec:

res = regexec(&preg, search_str, 1, &pmatch, REG_NOTBOL);
// Notice the argument 1 here ---^

参数1表示最多只有一个结果存储在pmatch数组中。因此,您从上方获得result[0]。由于您需要第一个匹配的组(而不是整个匹配的字符串),您必须:

  1. 将pmatch定义为大小至少为2的数组。
  2. 将2作为参数传递给上面的regexec调用。
  3. 完成上述操作后:

    /*
     * Copies the text of the first parenthesised group of reg_exp, as matched
     * in search_str, into a freshly allocated string stored in *output.
     *
     * *output is always written before returning:
     *  - compilation failed: the regerror() message (NULL if the buffer could
     *    not be allocated); the function returns false
     *  - no match, or group 1 did not take part in the match: NULL; the
     *    function returns true
     *  - match: the text of group 1 (possibly the empty string); the
     *    function returns true
     *
     * NOTE: The user is responsible for free-ing the memory for *output
     */
    bool get_first_match(const char* search_str, const char* reg_exp, char** output)
    {
        // Slot 0 is the whole match, slot 1 is the first capture group
        enum { NUM_SLOTS = 2, ERROR_BUF_SIZE = 1024 };
        int res;
        regex_t preg;
        regmatch_t pmatch[NUM_SLOTS];

        *output = NULL;

        // Compile the input regexp
        if( (res = regcomp(&preg, reg_exp, REG_EXTENDED)) != 0)
        {
            char* error = malloc(ERROR_BUF_SIZE);
            if(error != NULL)
            {
                regerror(res, &preg, error, ERROR_BUF_SIZE);
            }
            *output = error;
            return false;
        }

        // Pass the array itself (it decays to regmatch_t*), not &pmatch,
        // and ask for exactly as many slots as the array holds
        res = regexec(&preg, search_str, NUM_SLOTS, pmatch, REG_NOTBOL);
        // pmatch holds plain offsets, so the compiled pattern can go now
        regfree(&preg);
        if(res != 0)
        {
            return true;
        }

        // I am using pmatch[1] since that is equivalent to our
        // result[1] from above; rm_so is -1 when the group did not
        // participate in the match (or the pattern has no group at all)
        if(pmatch[1].rm_so < 0)
        {
            return true;
        }

        size_t len = (size_t)(pmatch[1].rm_eo - pmatch[1].rm_so);
        char* result = malloc(len + 1);
        if(result != NULL)
        {
            memcpy(result, search_str + pmatch[1].rm_so, len);
            result[len] = '\0'; // null-terminate the result
        }
        *output = result;
        return true;
    }
    

    这样程序就能按预期工作了。