如何在纯C中进行正则表达式字符串替换?

16

我查看了POSIX正则表达式库和PCRE库中的正则表达式函数,但是它们似乎都没有字符串替换功能。我不想使用C++,如果不需要链接其他库最好(但如果必要我可以)。那么我需要手动进行字符串替换吗?如果需要,我如何使用捕获组?


你指的是哪些捕获组?C语言没有内置的正则表达式库,你需要使用pcre或类似的库。 - lvella
那你想让我具体说明吗?我有一个HTTP请求头,想用正则表达式进行更改并转发到服务器。 - Yifan
2
这可能会有帮助:http://www.linuxquestions.org/questions/programming-9/replace-a-substring-with-another-string-in-c-170076/#post877511 - Alvin K.
尝试访问 http://www.daniweb.com/software-development/c/code/216955 - tolitius
你正在使用动态正则表达式吗?如果不是,有很多工具在这些年里已经出现来进行静态解析。re2c非常优秀。 - Neil
显示剩余2条评论
4个回答

24

regex.h没有原生的字符串替换功能,但是它提供了子表达式/捕获组,使得操作变得更加容易。我假设你已经熟悉正则表达式编译并跳过到正则表达式执行和子表达式部分。

在regex.h(/usr/include/)中,regexec()定义如下:

/* Prototype of regexec() as quoted from regex.h.
 * __preg is the compiled regular expression, __string is the text to run it
 * against and __eflags holds the execution flags. __nmatch and __pmatch
 * describe the regmatch_t array; each regmatch_t carries rm_so / rm_eo, the
 * start and end offsets of a matched region. */
extern int regexec (const regex_t *__restrict __preg,
        const char *__restrict __string, size_t __nmatch,
        regmatch_t __pmatch[__restrict_arr],
        int __eflags);

第一个、第二个和最后一个参数分别是已编译的正则表达式、要匹配的字符串和执行标志。第三个和第四个参数分别指定regmatch_t数组的元素个数和数组本身。regmatch_t由两个字段rm_so和rm_eo组成,它们分别是匹配区域开始和结束位置的索引(偏移量)。然后就可以利用这些索引,配合string.h中的memcpy()、memset()和memmove()进行字符串替换。
我会举个小例子并稍后发布。
祝你好运,希望这有所帮助。

12
“我会举个小例子并稍后发布。”什么时候?你在5年前就做出了这个承诺。 ;( 这里是我到目前为止找到的一些示例。 - patryk.beza

6
PCRE库本身并不提供替换功能,但是在PCRE下载页面上有一个包装函数可用,该函数接受Perl风格的=~ s/pattern/replace/语法,然后使用PCRE本机函数为您进行替换操作。访问http://www.pcre.org/,然后单击下载链接:ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,然后进入Contrib目录。您需要的包/项目是:pcrs-0.0.3-src.tar.gz
请注意,我自己没有使用过这个函数,因此无法证明其有效性。然而,这是一个相当小而简单的代码片段,所以它可能会很好地满足您的需求。

嗨,谢谢提供链接。但是我该如何在我的程序中使用pcrs?我目前使用的是pcre_compile和pcre_exec函数,就像这里所示https://dev59.com/hUnSa4cB1Zd3GeqPNFhf#1421923 - user13107

3

我接手了@marnout的帖子,并修复了其中的一些错误和拼写错误。修复的问题包括:内存泄漏;当替换内容本身包含模式时会无限替换;将函数内部的打印输出改为通过返回值报告结果;反向引用的编号实际上最多可到31;此外还完善了文档说明,并增加了更多测试示例。

/* regex_replace.c
:w | !gcc % -o .%<
:w | !gcc % -o .%< && ./.%<
:w | !gcc % -o .%< && valgrind -v ./.%<
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Highest char code that is interpreted as a back-reference marker. */
enum { RR_MAX_BACKREF = 31 };

/* Growable, always NUL-terminated output buffer used while rewriting. */
struct rr_buf {
    char *data;  /* heap storage owned by the buffer */
    size_t len;  /* bytes in use, not counting the terminator */
    size_t cap;  /* bytes allocated */
};

/* Appends n bytes of src to out, growing it as needed (exits on OOM). */
static void rr_append(struct rr_buf *out, const char *src, size_t n) {
    if (out->cap - out->len <= n) {
        size_t need = out->len + n + 1;
        size_t cap = out->cap ? out->cap : 64;
        while (cap < need) {
            cap *= 2;
        }
        char *grown = realloc(out->data, cap);
        if (!grown) {
            exit(EXIT_FAILURE);
        }
        out->data = grown;
        out->cap = cap;
    }
    memcpy(out->data + out->len, src, n);
    out->len += n;
    out->data[out->len] = '\0';
}

/* Appends replace to out, substituting each back-reference marker with the
 * text of the corresponding capture group of the current match in subject. */
static void rr_expand(struct rr_buf *out, const char *replace,
                      const char *subject, const regmatch_t *m) {
    const char *literal = replace;
    for (const char *p = replace;; p++) {
        /* unsigned so that bytes >= 0x80 are never mistaken for markers */
        unsigned char ch = (unsigned char)*p;
        if (ch > RR_MAX_BACKREF) {
            continue;
        }
        rr_append(out, literal, (size_t)(p - literal));
        if (ch == '\0') {
            break;
        }
        /* a group that did not participate in the match has rm_so == -1 */
        if (m[ch].rm_so >= 0) {
            rr_append(out, subject + m[ch].rm_so,
                      (size_t)(m[ch].rm_eo - m[ch].rm_so));
        }
        literal = p + 1; /* resume literal text after the marker byte */
    }
}

/*
 * Replaces every match of the extended regular expression `pattern` in *str
 * with `replace`, expanding capture groups.
 *
 * str      - address of a heap-allocated, NUL-terminated string (obtained
 *            from malloc/strdup/...). On success the old buffer is freed and
 *            *str points to a newly allocated result the caller must free.
 * pattern  - POSIX extended regular expression.
 * replace  - replacement text. Bytes with char codes 1-31 are back
 *            references to the capture group with that number (e.g. "\1"),
 *            so those characters (tab, newline, ...) cannot appear literally.
 *
 * Text produced by a replacement is never searched again; scanning resumes
 * right after the previous match. An empty match copies one character of
 * the input forward so the scan always makes progress. `^` can match only
 * at the very start of the string.
 *
 * Returns:
 *   -1 if the pattern cannot be compiled
 *   -2 if the number of back references differs from the number of capture
 *      groups, or a back reference names a group that does not exist
 *      (*str is left untouched in both error cases)
 *   otherwise the number of matches that were replaced.
 * Allocation failure terminates the process with EXIT_FAILURE.
 */
int regex_replace(char **str, const char *pattern, const char *replace) {
    regex_t reg;
    if (regcomp(&reg, pattern, REG_EXTENDED) != 0) {
        return -1;
    }

    /* validate the back references before touching *str */
    size_t nmatch = reg.re_nsub;
    size_t backrefs = 0;
    for (const char *p = replace; *p; p++) {
        unsigned char ch = (unsigned char)*p;
        if (ch > RR_MAX_BACKREF) {
            continue;
        }
        if (ch > nmatch) {
            regfree(&reg);
            return -2;
        }
        backrefs++;
    }
    if (backrefs != nmatch) {
        regfree(&reg);
        return -2;
    }

    regmatch_t *m = malloc((nmatch + 1) * sizeof *m);
    if (!m) {
        exit(EXIT_FAILURE);
    }

    struct rr_buf out = {0};
    const char *cursor = *str;
    int replacements = 0;
    int eflags = 0; /* the first search starts at the real beginning of line */

    while (regexec(&reg, cursor, nmatch + 1, m, eflags) == 0) {
        size_t so = (size_t)m[0].rm_so;
        size_t eo = (size_t)m[0].rm_eo;

        rr_append(&out, cursor, so);           /* text before the match */
        rr_expand(&out, replace, cursor, m);   /* the replacement itself */
        replacements++;

        if (eo == so) {
            /* empty match: step over one input char or we would spin here */
            if (cursor[eo] == '\0') {
                cursor += eo;
                break;
            }
            rr_append(&out, cursor + eo, 1);
            eo++;
        }
        cursor += eo;
        eflags = REG_NOTBOL; /* later searches start mid-string */
    }
    rr_append(&out, cursor, strlen(cursor));   /* unmatched tail */

    regfree(&reg);
    free(m);

    /* trim to the exact size; keep the larger buffer if shrinking fails */
    char *exact = realloc(out.data, out.len + 1);
    free(*str);
    *str = exact ? exact : out.data;
    return replacements;
}

/* Test 1: pattern with two capture groups; replace1 refers to them through
 * the char codes 2 and 1 (written as the octal escapes \2 and \1). */
const char test1[] = "before [link->address] some text [link2->addr2] trail[a->[b->c]]";
const char *pattern1 = "\\[([^-]+)->([^]]+)\\]";
const char replace1[] = "<a href=\"\2\">\1</a>";

/* Test 2: no capture groups; the replacement text itself contains the pattern. */
const char test2[] = "abcabcdefghijklmnopqurstuvwxyzabc";
const char *pattern2 = "abc";
const char replace2[] = "!abc";

/* Test 3: single-character pattern whose replacement repeats that character. */
const char test3[] = "a1a1a1a2ba1";
const char *pattern3 = "a";
const char replace3[] = "aa";
/* Runs one demo: copies text to the heap, prints it, applies regex_replace
 * and prints the result together with the value regex_replace returned. */
static void run_case(int id, const char *text, const char *pattern,
                     const char *replace) {
    char *work = malloc(strlen(text) + 1);
    if (!work) {
        fputs("out of memory\n", stderr);
        exit(EXIT_FAILURE);
    }
    strcpy(work, text);
    puts(work);
    printf("test %d Before: [%s], ", id, work);
    /* int, not unsigned: regex_replace reports errors as negative values */
    int count = regex_replace(&work, pattern, replace);
    printf("After replacing %d matches: [%s]\n", count, work);
    free(work);
}

/* Exercises regex_replace on the three sample inputs defined above. */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    run_case(1, test1, pattern1, replace1);
    run_case(2, test2, pattern2, replace2);
    run_case(3, test3, pattern3, replace3);
    return 0;
}

new = (char *)malloc(strlen(*str) + strlen(replace)) does not look correct. say, pattern = "(hello)(world)" and replace = "\1\2\1\2" - pynexj

0
/* regex_replace.c
   :w | !gcc % -o .%<
   :w | !gcc % -o .%< && ./.%<
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Highest char code that is interpreted as a back-reference marker. */
enum { RX_MAX_BACKREF = 31 };

/* Growable, always NUL-terminated output buffer used while rewriting. */
struct rx_buf {
    char *data;  /* heap storage owned by the buffer */
    size_t len;  /* bytes in use, not counting the terminator */
    size_t cap;  /* bytes allocated */
};

/* Appends n bytes of src to out, growing it as needed (exits on OOM). */
static void rx_append(struct rx_buf *out, const char *src, size_t n) {
    if (out->cap - out->len <= n) {
        size_t need = out->len + n + 1;
        size_t cap = out->cap ? out->cap : 64;
        while (cap < need) {
            cap *= 2;
        }
        char *grown = realloc(out->data, cap);
        if (!grown) {
            exit(EXIT_FAILURE);
        }
        out->data = grown;
        out->cap = cap;
    }
    memcpy(out->data + out->len, src, n);
    out->len += n;
    out->data[out->len] = '\0';
}

/*
 * Replaces every match of the extended regular expression `pattern` in *str
 * with `replace`.
 *
 * *str MUST be free-able (obtained from strdup, malloc, ...); it is freed and
 * replaced by a newly allocated string that the caller must free.
 * Bytes with char codes 1-31 inside `replace` are back references to the
 * capture group with that number (e.g. "\1", "\2").
 *
 * Nothing is changed when the number of back references differs from the
 * number of capture groups or a back reference names a missing group.
 * When the pattern does not compile, a message is printed and *str is left
 * untouched. Replaced text is not searched again, so a replacement that
 * contains the pattern cannot loop forever. Allocation failure terminates
 * the process with EXIT_FAILURE.
 */
void
regex_replace(char **str, const char *pattern, const char *replace) {
    regex_t reg;
    if (regcomp(&reg, pattern, REG_EXTENDED) != 0) {
        printf("Could not compile regex: %s\n", pattern);
        return;
    }

    /* count and range-check the back references in replace */
    size_t nmatch = reg.re_nsub;
    size_t backrefs = 0;
    int valid = 1;
    for (const char *p = replace; *p; p++) {
        /* unsigned so that bytes >= 0x80 are never mistaken for markers */
        unsigned char ch = (unsigned char)*p;
        if (ch > RX_MAX_BACKREF) {
            continue;
        }
        if (ch > nmatch) {
            valid = 0;
        }
        backrefs++;
    }
    if (!valid || backrefs != nmatch) {
        regfree(&reg);
        return;
    }

    regmatch_t *m = malloc((nmatch + 1) * sizeof *m);
    if (!m) {
        exit(EXIT_FAILURE);
    }

    struct rx_buf out = {0};
    const char *cursor = *str;
    int eflags = 0; /* the first search starts at the real beginning of line */

    while (regexec(&reg, cursor, nmatch + 1, m, eflags) == 0) {
        size_t so = (size_t)m[0].rm_so;
        size_t eo = (size_t)m[0].rm_eo;

        rx_append(&out, cursor, so); /* text before the match */

        /* replacement: literal runs interleaved with capture groups */
        const char *literal = replace;
        for (const char *p = replace;; p++) {
            unsigned char ch = (unsigned char)*p;
            if (ch > RX_MAX_BACKREF) {
                continue;
            }
            rx_append(&out, literal, (size_t)(p - literal));
            if (ch == '\0') {
                break;
            }
            /* a group that did not participate has rm_so == -1 */
            if (m[ch].rm_so >= 0) {
                rx_append(&out, cursor + m[ch].rm_so,
                          (size_t)(m[ch].rm_eo - m[ch].rm_so));
            }
            literal = p + 1;
        }

        if (eo == so) {
            /* empty match: step over one input char to guarantee progress */
            if (cursor[eo] == '\0') {
                cursor += eo;
                break;
            }
            rx_append(&out, cursor + eo, 1);
            eo++;
        }
        cursor += eo;
        eflags = REG_NOTBOL; /* later searches start mid-string */
    }
    rx_append(&out, cursor, strlen(cursor)); /* unmatched tail */

    regfree(&reg);
    free(m);

    /* trim to the exact size; keep the larger buffer if shrinking fails */
    char *exact = realloc(out.data, out.len + 1);
    free(*str);
    *str = exact ? exact : out.data;
}

/* Demo: rewrites "[text->target]" links in a sample string into HTML anchors
 * and prints the string before and after the replacement. */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    const char *pattern = "\\[([^-]+)->([^]]+)\\]";
    /* \2 and \1 are char codes 2 and 1, i.e. back references to the groups */
    const char rpl[] = "<a href=\"\2\">\1</a>";
    char *str = strdup("before [link->address] some text [link2->addr2] trail");
    if (!str) {
        fputs("out of memory\n", stderr);
        return EXIT_FAILURE;
    }
    puts(str);
    regex_replace(&str, pattern, rpl);
    puts(str);
    free(str);
    return 0;
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接