如何在纯C中进行正则表达式字符串替换?

时间:2011-11-07 23:21:05

标签: c regex string

我已经查看了POSIX正则表达式库和PCRE库中的正则表达式函数,但它们似乎都没有提供字符串替换函数。我不想使用C++,并且最好不需要再链接另一个库(不过如果必须,也可以)。我需要手动进行字符串替换吗?如果是这样,我该如何使用捕获组?

4 个答案:

答案 0 :(得分:23)

regex.h不提供对字符串替换的原生支持,但它确实提供了子表达式/捕获组,这使得替换更容易实现。我假设您已熟悉正则表达式的编译,因此直接跳到正则表达式的执行和子表达式。

regexec()在regex.h(位于/usr/include/)中定义如下:

/* Prototype quoted from regex.h: executes the compiled regex __preg against
 * __string using the execution flags __eflags.  __pmatch is an array of
 * __nmatch regmatch_t entries; each entry's rm_so / rm_eo fields hold the
 * start and end offsets of a matched region. */
extern int regexec (const regex_t *__restrict __preg,
        const char *__restrict __string, size_t __nmatch,
        regmatch_t __pmatch[__restrict_arr],
        int __eflags);

第一个、第二个和最后一个参数分别是要执行的正则表达式、字符串和执行标志。第三个和第四个参数用于指定regmatch_t的数组。regmatch_t由两个字段组成:rm_so和rm_eo,它们分别是匹配区域的开始和结束的索引(偏移量)。这些索引可以与string.h中的memcpy()、memset()和memmove()一起使用,以执行字符串替换。

我会做一个小例子并稍后发布。

祝你好运,我希望这有所帮助。

答案 1 :(得分:6)

PCRE库本身不提供替换功能,但PCRE下载页面上提供了一个包装函数,它接受perl风格的=~ s/pattern/replace/语法,然后使用PCRE原生函数替您完成查找/替换。转到http://www.pcre.org/,然后点击下载链接:ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,然后进入Contrib目录。您需要的包/项目是:pcrs-0.0.3-src.tar.gz

请注意,我自己没有使用过这个,所以我无法证明它的效果如何。然而,这是一个相当小而简单的代码,所以它可以很好地满足您的目的。

答案 2 :(得分:0)

/* regex_replace.c
   :w | !gcc % -o .%<
   :w | !gcc % -o .%< && ./.%<
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Append n bytes of src to the heap buffer *buf (current length *len,
 * capacity *cap), growing it as needed and keeping it NUL-terminated.
 * Exits the process on allocation failure or size overflow. */
static void append_bytes(char **buf, size_t *len, size_t *cap,
                         const char *src, size_t n) {
    if (n >= (size_t)-1 - *len) {
        exit(EXIT_FAILURE);
    }
    size_t need = *len + n + 1; /* +1 for the terminating NUL */
    if (need > *cap) {
        size_t grown = *cap ? *cap : 16;
        while (grown < need) {
            grown = (grown > (size_t)-1 / 2) ? need : grown * 2;
        }
        char *tmp = realloc(*buf, grown);
        if (!tmp) {
            exit(EXIT_FAILURE);
        }
        *buf = tmp;
        *cap = grown;
    }
    memcpy(*buf + *len, src, n);
    *len += n;
    (*buf)[*len] = '\0';
}

/*
 * Replace every match of the POSIX extended regex `pattern` in *str with
 * `replace`.
 *
 * str     - address of a heap string; *str MUST be free-able (obtained from
 *           strdup, malloc, ...).  On success it is freed and replaced by a
 *           newly allocated string.
 * pattern - POSIX extended regular expression.
 * replace - replacement text.  Bytes with values 1..31 are back references
 *           to the capture group of that number ("\1", "\2", ...), so those
 *           bytes (tab, newline, ...) cannot appear literally.
 *
 * The number of back references in `replace` must equal the number of
 * capture groups in `pattern` and every reference must name an existing
 * group; otherwise *str is left untouched.  If the pattern does not compile
 * a message is printed to stdout and *str is left untouched.  Replaced text
 * is never rescanned, so a replacement that itself contains the pattern
 * cannot cause endless replacement.  Exits on allocation failure.
 */
void
regex_replace(char **str, const char *pattern, const char *replace) {
    regex_t reg;
    if (regcomp(&reg, pattern, REG_EXTENDED) != 0) {
        printf("Could not compile regex: %s\n", pattern);
        return;
    }

    const size_t nmatch = reg.re_nsub;

    /* Validate the back references before touching *str. */
    size_t br = 0;
    for (const unsigned char *q = (const unsigned char *)replace; *q != '\0'; q++) {
        if (*q <= 31) {
            if ((size_t)*q > nmatch) {   /* reference to a group that does not exist */
                regfree(&reg);
                return;
            }
            br++;
        }
    }
    if (br != nmatch) {
        regfree(&reg);
        return;
    }

    regmatch_t *m = malloc((nmatch + 1) * sizeof *m);
    if (!m) {
        exit(EXIT_FAILURE);
    }

    char *out = NULL;
    size_t len = 0;
    size_t cap = 0;
    append_bytes(&out, &len, &cap, "", 0); /* start with an empty string */

    const char *cursor = *str;
    for (;;) {
        /* Only the very first search starts at the real beginning of the
         * string, so only there may '^' match. */
        int eflags = (cursor == *str) ? 0 : REG_NOTBOL;
        if (regexec(&reg, cursor, nmatch + 1, m, eflags) != 0) {
            break;
        }
        const size_t so = (size_t)m[0].rm_so;
        const size_t eo = (size_t)m[0].rm_eo;

        append_bytes(&out, &len, &cap, cursor, so); /* text before the match */

        /* Expand the replacement: literal runs interleaved with groups. */
        const char *lit = replace;
        const char *q = replace;
        for (;;) {
            unsigned char ch = (unsigned char)*q;
            if (ch > 31) {
                q++;
                continue;
            }
            append_bytes(&out, &len, &cap, lit, (size_t)(q - lit));
            if (ch == '\0') {
                break;
            }
            if (m[ch].rm_so >= 0) { /* group took part in the match */
                append_bytes(&out, &len, &cap, cursor + m[ch].rm_so,
                             (size_t)(m[ch].rm_eo - m[ch].rm_so));
            }
            lit = ++q;
        }

        cursor += eo;
        if (eo == so) {
            /* Empty match: copy one byte forward so the loop always advances. */
            if (*cursor == '\0') {
                break;
            }
            append_bytes(&out, &len, &cap, cursor, 1);
            cursor++;
        }
    }
    append_bytes(&out, &len, &cap, cursor, strlen(cursor)); /* trailing text */

    free(m);
    regfree(&reg);

    /* Shrink to fit; keep the larger buffer if shrinking fails. */
    char *fit = realloc(out, len + 1);
    if (fit) {
        out = fit;
    }
    free(*str);
    *str = out;
}

int main(int argc, char *argv[])
{
    /* Demo: rewrite "[text->target]" markers as HTML anchors.  The bytes \2
     * and \1 in the template are the back references to groups 2 and 1. */
    static const char link_regex[] = "\\[([^-]+)->([^]]+)\\]";
    char anchor_template[] = "<a href=\"\2\">\1</a>";
    char *text = strdup("before [link->address] some text [link2->addr2] trail");

    puts(text);                                        /* input as given */
    regex_replace(&text, link_regex, anchor_template);
    puts(text);                                        /* after replacement */

    free(text);
    return 0;
}

答案 3 :(得分:0)

我采用了@marnout 的帖子并对其进行了修复,解决了许多错误和拼写错误。修复内容:内存泄漏;当替换文本中包含模式时的无限替换;将函数中的打印改为返回值;反向引用的值实际上最高可达 31;补充了文档;增加了更多测试示例。

/* regex_replace.c
:w | !gcc % -o .%<
:w | !gcc % -o .%< && ./.%<
:w | !gcc % -o .%< && valgrind -v ./.%<
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Append n bytes of src to the growable, NUL-terminated heap buffer *buf
 * (current length *len, capacity *cap).  Returns 0 on success, -1 on
 * allocation failure or size overflow; the buffer stays valid on failure. */
static int regex_replace_append(char **buf, size_t *len, size_t *cap,
                                const char *src, size_t n) {
    if (n >= (size_t)-1 - *len) {
        return -1;
    }
    size_t need = *len + n + 1; /* +1 for the terminating NUL */
    if (need > *cap) {
        size_t grown = *cap ? *cap : 16;
        while (grown < need) {
            grown = (grown > (size_t)-1 / 2) ? need : grown * 2;
        }
        char *tmp = realloc(*buf, grown);
        if (!tmp) {
            return -1;
        }
        *buf = tmp;
        *cap = grown;
    }
    memcpy(*buf + *len, src, n);
    *len += n;
    (*buf)[*len] = '\0';
    return 0;
}

/*
 * Replace every match of the POSIX extended regex `pattern` in *str with
 * `replace`, expanding capture-group back references.
 *
 * str     - address of a heap string; *str MUST be free-able (obtained from
 *           strdup, malloc, ...).  On success it is freed and replaced by a
 *           newly allocated string; on any error it is left untouched.
 * pattern - POSIX extended regular expression.
 * replace - replacement text.  Bytes with values 1..31 are back references
 *           to the capture group of that number ("\1", "\2", ...), so none
 *           of those bytes (e.g. a tab) can appear literally.
 *
 * Replaced text is never rescanned: searching resumes after the end of the
 * previous match.  An empty match inserts the replacement and then advances
 * by one byte, so the function always terminates.  '^' can match only at the
 * very start of *str.
 *
 * Returns:
 *   -1 if pattern cannot be compiled
 *   -2 if the count of back references and capture groups don't match, or a
 *      back reference names a group that does not exist
 *   -3 if memory could not be allocated
 *   otherwise the number of matches that were found and replaced
 */
int regex_replace(char **str, const char *pattern, const char *replace) {
    regex_t reg;
    if (regcomp(&reg, pattern, REG_EXTENDED) != 0) {
        return -1;
    }

    const size_t nmatch = reg.re_nsub;
    regmatch_t *m = NULL;
    char *out = NULL;
    size_t len = 0;
    size_t cap = 0;
    size_t br = 0;
    int replacements = 0;
    int result = -3;
    const char *cursor = *str;

    /* Validate the back references before touching *str. */
    for (const unsigned char *q = (const unsigned char *)replace; *q != '\0'; q++) {
        if (*q <= 31) {
            if ((size_t)*q > nmatch) {
                result = -2;
                goto done;
            }
            br++;
        }
    }
    if (br != nmatch) {
        result = -2;
        goto done;
    }

    m = malloc((nmatch + 1) * sizeof *m);
    if (!m || regex_replace_append(&out, &len, &cap, "", 0) != 0) {
        goto done;
    }

    for (;;) {
        /* Only the first search starts at the true beginning of the string. */
        int eflags = (cursor == *str) ? 0 : REG_NOTBOL;
        if (regexec(&reg, cursor, nmatch + 1, m, eflags) != 0) {
            break;
        }
        const size_t so = (size_t)m[0].rm_so;
        const size_t eo = (size_t)m[0].rm_eo;

        /* text before the match */
        if (regex_replace_append(&out, &len, &cap, cursor, so) != 0) {
            goto done;
        }

        /* Expand the replacement: literal runs interleaved with groups. */
        const char *lit = replace;
        const char *q = replace;
        for (;;) {
            unsigned char ch = (unsigned char)*q;
            if (ch > 31) {
                q++;
                continue;
            }
            if (regex_replace_append(&out, &len, &cap, lit, (size_t)(q - lit)) != 0) {
                goto done;
            }
            if (ch == '\0') {
                break;
            }
            if (m[ch].rm_so >= 0 && /* group took part in the match */
                regex_replace_append(&out, &len, &cap, cursor + m[ch].rm_so,
                                     (size_t)(m[ch].rm_eo - m[ch].rm_so)) != 0) {
                goto done;
            }
            lit = ++q;
        }
        replacements++;

        cursor += eo;
        if (eo == so) {
            /* Empty match: copy one byte forward so the loop always advances. */
            if (*cursor == '\0') {
                break;
            }
            if (regex_replace_append(&out, &len, &cap, cursor, 1) != 0) {
                goto done;
            }
            cursor++;
        }
    }

    /* trailing text after the last match */
    if (regex_replace_append(&out, &len, &cap, cursor, strlen(cursor)) != 0) {
        goto done;
    }

    /* Shrink to fit; keep the larger buffer if shrinking fails. */
    {
        char *fit = realloc(out, len + 1);
        if (fit) {
            out = fit;
        }
    }
    free(*str);
    *str = out;
    out = NULL; /* ownership transferred to the caller */
    result = replacements;

done:
    free(out);
    free(m);
    regfree(&reg);
    return result;
}

// Test 1: pattern with two capture groups.  The bytes \2 and \1 in the
// replacement are the control characters that regex_replace treats as back
// references to groups 2 and 1.
const char test1[] = "before [link->address] some text [link2->addr2] trail[a->[b->c]]";
const char *pattern1 = "\\[([^-]+)->([^]]+)\\]";
const char replace1[] = "<a href=\"\2\">\1</a>";

// Test 2: no capture groups; the replacement text itself contains the pattern.
const char test2[] = "abcabcdefghijklmnopqurstuvwxyzabc";
const char *pattern2 = "abc";
const char replace2[] = "!abc";

// Test 3: single-character pattern whose replacement is two copies of it.
const char test3[] = "a1a1a1a2ba1";
const char *pattern3 = "a";
const char replace3[] = "aa";
/* Run one demo: copy `input` to the heap, print it, apply regex_replace and
 * print the result code together with the resulting string. */
static void run_replace_demo(int test_no, const char *input,
                             const char *pattern, const char *replace) {
    char *work = malloc(strlen(input) + 1);
    if (!work) {
        exit(EXIT_FAILURE);
    }
    strcpy(work, input);

    puts(work);
    printf("test %d Before: [%s], ", test_no, work);
    /* regex_replace returns int (negative on error), so keep it signed to
     * match the %d conversion below. */
    int repl_count = regex_replace(&work, pattern, replace);
    printf("After replacing %d matches: [%s]\n", repl_count, work);

    free(work);
}

int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    run_replace_demo(1, test1, pattern1, replace1);
    run_replace_demo(2, test2, pattern2, replace2);
    run_replace_demo(3, test3, pattern3, replace3);
    return 0;
}