C ++多字符串匹配而不使用正则表达式

时间:2016-04-01 15:19:49

标签: c++ regex string

我正在尝试编辑现有的C ++程序,以便在文本中搜索多个字符串而不是单个字符串。我是一个完整的C ++菜鸟,所以我希望有人可以指出一个能够正常工作并且快速的功能。我尝试使用正则表达式(see my earlier post),这可行,但正则表达式对我的目的来说太慢了。请使用以下代码(由@WiktorStribiżew提供):

#include <iostream>
#include <regex>
using namespace std;

int main() {
    std::string pattern("A|D");         // Regex expression
    std::regex rx(pattern);             // Getting the regex object 

    std::string s("ABCDEABCABD");       // Defining the string input
    std::ptrdiff_t number_of_matches = std::distance(  // Count the number of matches inside the iterator
        std::sregex_iterator(s.begin(), s.end(), rx),
        std::sregex_iterator());

    std::cout << number_of_matches << std::endl;  // Displaying results
    return 0;
}

这会输出想要的答案(5)但是我的目的太慢了(在~100 + GB文件中搜索每一行)。我怎样才能让它更快?从文件中读取输入主题字符串 s ,并在命令行上提供查询字符串 pattern ,因此它以字符串形式出现(由管道分隔为A | D) ,例如)。

1 个答案:

答案 0 :(得分:4)

正则表达式和表现相互咬合 我重写了你的例子,不使用Regex,并编译了两个版本(在OS X上,使用带有-O3 -fwhole-program的g ++ 5.3.0):

#include <iostream>
using namespace std;

#define NUM_PATTERNS 2

int main() {
    std::string patterns[] = {std::string("A"), std::string("D")};
    std::string str("ABCDEABCABD");
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(int pos = 0; ; )
        {
            pos = str.find(patterns[i], pos);
            if(pos == std::string::npos)
            {
                break;
            }
            ++pos;      // in order to not match again
            ++count;
        }
    }

    std::cout << count << std::endl;  // Displaying results
    return 0;
}

生成的程序集为545行:

    .cstring
    .align 3
LC0:
    .ascii "basic_string::_M_construct null not valid\0"
    .section __TEXT,__text_cold,regular,pure_instructions
    .align 1
LCOLDB1:
    .text
LHOTB1:
    .align 1,0x90
    .align 4,0x90
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18:
LFB1645:
    pushq   %r13
LCFI0:
    pushq   %r12
LCFI1:
    leaq    16(%rdi), %r12
    pushq   %rbp
LCFI2:
    pushq   %rbx
LCFI3:
    subq    $24, %rsp
LCFI4:
    testq   %rsi, %rsi
    movq    %r12, (%rdi)
    je  L2
    movq    %rdi, %rbx
    movq    %rsi, %rdi
    movq    %rsi, %r13
    call    _strlen
    cmpq    $15, %rax
    movq    %rax, %rbp
    movq    %rax, 8(%rsp)
    ja  L13
    cmpq    $1, %rax
    je  L14
    testq   %rax, %rax
    jne L15
L6:
    movq    8(%rsp), %rax
    movq    (%rbx), %rdx
    movq    %rax, 8(%rbx)
    movb    $0, (%rdx,%rax)
    addq    $24, %rsp
LCFI5:
    popq    %rbx
LCFI6:
    popq    %rbp
LCFI7:
    popq    %r12
LCFI8:
    popq    %r13
LCFI9:
    ret
L13:
LCFI10:
    leaq    8(%rsp), %rsi
    xorl    %edx, %edx
    movq    %rbx, %rdi
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm
    movq    8(%rsp), %rdx
    movq    %rax, (%rbx)
    movq    %rax, %rdi
    movq    %rdx, 16(%rbx)
L4:
    movq    %rbp, %rdx
    movq    %r13, %rsi
    call    _memcpy
    jmp L6
L14:
    movzbl  0(%r13), %eax
    movb    %al, 16(%rbx)
    jmp L6
L2:
    leaq    LC0(%rip), %rdi
    call    __ZSt19__throw_logic_errorPKc
L15:
    movq    %r12, %rdi
    jmp L4
LFE1645:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
    .text
LHOTE1:
    .cstring
LC2:
    .ascii "A\0"
LC3:
    .ascii "D\0"
LC4:
    .ascii "ABCDEABCABD\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB5:
    .align 4
    .globl _main
_main:
LFB1425:
    pushq   %r15
LCFI11:
    leaq    LC2(%rip), %rsi
    pushq   %r14
LCFI12:
    pushq   %r13
LCFI13:
    pushq   %r12
LCFI14:
    pushq   %rbp
LCFI15:
    pushq   %rbx
LCFI16:
    subq    $104, %rsp
LCFI17:
    leaq    32(%rsp), %rbp
    movq    %rbp, %rdi
LEHB0:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE0:
    leaq    32(%rbp), %rdi
    leaq    LC3(%rip), %rsi
LEHB1:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE1:
    leaq    LC4(%rip), %rsi
    movq    %rsp, %rdi
    movq    %rsp, %r12
LEHB2:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE2:
    leaq    8(%rbp), %r14
    xorl    %r15d, %r15d
    xorl    %ebx, %ebx
L22:
    leaq    (%r15,%r14), %r13
    xorl    %eax, %eax
    jmp L21
    .align 4
L44:
    addl    $1, %eax
    addl    $1, %ebx
L21:
    movq    0(%rbp,%r15), %rsi
    movslq  %eax, %rdx
    movq    %r12, %rdi
    movq    0(%r13), %rcx
    call    __ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE4findEPKcmm
    cmpl    $-1, %eax
    jne L44
    addq    $32, %r15
    cmpq    $64, %r15
    jne L22
    movq    __ZSt4cout@GOTPCREL(%rip), %rdi
    movl    %ebx, %esi
LEHB3:
    call    __ZNSo9_M_insertImEERSoT_
    movq    %rax, %rdi
    call    __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
LEHE3:
    movq    (%rsp), %rdi
    addq    $16, %r12
    cmpq    %r12, %rdi
    je  L23
    call    __ZdlPv
L23:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L24
    call    __ZdlPv
L24:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L39
    call    __ZdlPv
L39:
    addq    $104, %rsp
LCFI18:
    xorl    %eax, %eax
    popq    %rbx
LCFI19:
    popq    %rbp
LCFI20:
    popq    %r12
LCFI21:
    popq    %r13
LCFI22:
    popq    %r14
LCFI23:
    popq    %r15
LCFI24:
    ret
L38:
LCFI25:
    movq    %rax, %r12
    movl    $1, %eax
L19:
    movl    $1, %ebx
    subq    %rax, %rbx
    salq    $5, %rbx
    addq    %rbp, %rbx
L29:
    cmpq    %rbp, %rbx
    je  L27
    subq    $32, %rbx
    movq    (%rbx), %rdi
    leaq    16(%rbx), %rax
    cmpq    %rax, %rdi
    je  L29
    call    __ZdlPv
    jmp L29
L36:
    movq    (%rsp), %rdi
    addq    $16, %r12
    movq    %rax, %rbx
    cmpq    %r12, %rdi
    je  L32
    call    __ZdlPv
L32:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L33
    call    __ZdlPv
L33:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L34
    call    __ZdlPv
L34:
    movq    %rbx, %rdi
LEHB4:
    call    __Unwind_Resume
L27:
    movq    %r12, %rdi
    call    __Unwind_Resume
LEHE4:
L37:
    movq    %rax, %rbx
    jmp L32
L35:
    movq    %rax, %r12
    xorl    %eax, %eax
    jmp L19
LFE1425:
    .section __DATA,__gcc_except_tab
GCC_except_table0:
LLSDA1425:
    .byte   0xff
    .byte   0xff
    .byte   0x3
    .byte   0x41
    .set L$set$0,LEHB0-LFB1425
    .long L$set$0
    .set L$set$1,LEHE0-LEHB0
    .long L$set$1
    .set L$set$2,L38-LFB1425
    .long L$set$2
    .byte   0
    .set L$set$3,LEHB1-LFB1425
    .long L$set$3
    .set L$set$4,LEHE1-LEHB1
    .long L$set$4
    .set L$set$5,L35-LFB1425
    .long L$set$5
    .byte   0
    .set L$set$6,LEHB2-LFB1425
    .long L$set$6
    .set L$set$7,LEHE2-LEHB2
    .long L$set$7
    .set L$set$8,L37-LFB1425
    .long L$set$8
    .byte   0
    .set L$set$9,LEHB3-LFB1425
    .long L$set$9
    .set L$set$10,LEHE3-LEHB3
    .long L$set$10
    .set L$set$11,L36-LFB1425
    .long L$set$11
    .byte   0
    .set L$set$12,LEHB4-LFB1425
    .long L$set$12
    .set L$set$13,LEHE4-LEHB4
    .long L$set$13
    .long   0
    .byte   0
    .section __TEXT,__text_startup,regular,pure_instructions
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE5:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB6:
    .align 4
__GLOBAL__sub_I_test1.cpp:
LFB1626:
    leaq    __ZStL8__ioinit(%rip), %rdi
    subq    $8, %rsp
LCFI26:
    call    __ZNSt8ios_base4InitC1Ev
    movq    __ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
    leaq    ___dso_handle(%rip), %rdx
    addq    $8, %rsp
LCFI27:
    leaq    __ZStL8__ioinit(%rip), %rsi
    jmp ___cxa_atexit
LFE1626:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE6:
    .static_data
__ZStL8__ioinit:
    .space  1
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$14,LECIE1-LSCIE1
    .long L$set$14
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zPLR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x7
    .byte   0x9b
    .long   ___gxx_personality_v0+4@GOTPCREL
    .byte   0x10
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$15,LEFDE1-LASFDE1
    .long L$set$15
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1645-.
    .set L$set$16,LFE1645-LFB1645
    .quad L$set$16
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$17,LCFI0-LFB1645
    .long L$set$17
    .byte   0xe
    .byte   0x10
    .byte   0x8d
    .byte   0x2
    .byte   0x4
    .set L$set$18,LCFI1-LCFI0
    .long L$set$18
    .byte   0xe
    .byte   0x18
    .byte   0x8c
    .byte   0x3
    .byte   0x4
    .set L$set$19,LCFI2-LCFI1
    .long L$set$19
    .byte   0xe
    .byte   0x20
    .byte   0x86
    .byte   0x4
    .byte   0x4
    .set L$set$20,LCFI3-LCFI2
    .long L$set$20
    .byte   0xe
    .byte   0x28
    .byte   0x83
    .byte   0x5
    .byte   0x4
    .set L$set$21,LCFI4-LCFI3
    .long L$set$21
    .byte   0xe
    .byte   0x40
    .byte   0x4
    .set L$set$22,LCFI5-LCFI4
    .long L$set$22
    .byte   0xa
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$23,LCFI6-LCFI5
    .long L$set$23
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$24,LCFI7-LCFI6
    .long L$set$24
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$25,LCFI8-LCFI7
    .long L$set$25
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$26,LCFI9-LCFI8
    .long L$set$26
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$27,LCFI10-LCFI9
    .long L$set$27
    .byte   0xb
    .align 3
LEFDE1:
LSFDE3:
    .set L$set$28,LEFDE3-LASFDE3
    .long L$set$28
LASFDE3:
    .long   LASFDE3-EH_frame1
    .quad   LFB1425-.
    .set L$set$29,LFE1425-LFB1425
    .quad L$set$29
    .byte   0x8
    .quad   LLSDA1425-.
    .byte   0x4
    .set L$set$30,LCFI11-LFB1425
    .long L$set$30
    .byte   0xe
    .byte   0x10
    .byte   0x8f
    .byte   0x2
    .byte   0x4
    .set L$set$31,LCFI12-LCFI11
    .long L$set$31
    .byte   0xe
    .byte   0x18
    .byte   0x8e
    .byte   0x3
    .byte   0x4
    .set L$set$32,LCFI13-LCFI12
    .long L$set$32
    .byte   0xe
    .byte   0x20
    .byte   0x8d
    .byte   0x4
    .byte   0x4
    .set L$set$33,LCFI14-LCFI13
    .long L$set$33
    .byte   0xe
    .byte   0x28
    .byte   0x8c
    .byte   0x5
    .byte   0x4
    .set L$set$34,LCFI15-LCFI14
    .long L$set$34
    .byte   0xe
    .byte   0x30
    .byte   0x86
    .byte   0x6
    .byte   0x4
    .set L$set$35,LCFI16-LCFI15
    .long L$set$35
    .byte   0xe
    .byte   0x38
    .byte   0x83
    .byte   0x7
    .byte   0x4
    .set L$set$36,LCFI17-LCFI16
    .long L$set$36
    .byte   0xe
    .byte   0xa0,0x1
    .byte   0x4
    .set L$set$37,LCFI18-LCFI17
    .long L$set$37
    .byte   0xa
    .byte   0xe
    .byte   0x38
    .byte   0x4
    .set L$set$38,LCFI19-LCFI18
    .long L$set$38
    .byte   0xe
    .byte   0x30
    .byte   0x4
    .set L$set$39,LCFI20-LCFI19
    .long L$set$39
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$40,LCFI21-LCFI20
    .long L$set$40
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$41,LCFI22-LCFI21
    .long L$set$41
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$42,LCFI23-LCFI22
    .long L$set$42
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$43,LCFI24-LCFI23
    .long L$set$43
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$44,LCFI25-LCFI24
    .long L$set$44
    .byte   0xb
    .align 3
LEFDE3:
LSFDE5:
    .set L$set$45,LEFDE5-LASFDE5
    .long L$set$45
LASFDE5:
    .long   LASFDE5-EH_frame1
    .quad   LFB1626-.
    .set L$set$46,LFE1626-LFB1626
    .quad L$set$46
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$47,LCFI26-LFB1626
    .long L$set$47
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$48,LCFI27-LCFI26
    .long L$set$48
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE5:
    .mod_init_func
    .align 3
    .quad   __GLOBAL__sub_I_test1.cpp
    .constructor
    .destructor
    .align 1
    .subsections_via_symbols

您示例中的程序集是31行和559行,并且尝试在此处或在Pastebin上发布它会导致我的浏览器选项卡崩溃。

不过,我还写了上述程序的C版:

#include <stdio.h>
#include <string.h>

#define NUM_PATTERNS 2

int main()
{
    const char *patterns[] = {"A", "D"};
    char *str = "ABCDEABCABD";
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(char *s = str; ; )
        {
            s = strstr(s, patterns[i]);
            if(s == NULL)
            {
                break;
            }
            ++s;        // in order to not match again
            ++count;
        }
    }
    printf("%u\n", count);
    return 0;
}

编译只有95行汇编:

    .cstring
LC0:
    .ascii "ABCDEABCABD\0"
LC1:
    .ascii "%u\12\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB2:
    .align 4
    .globl _main
_main:
LFB1:
    pushq   %rbx
LCFI0:
    leaq    LC0(%rip), %rdi
    xorl    %ebx, %ebx
    jmp L3
    .align 4
L8:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L3:
    movl    $65, %esi
    call    _strchr
    testq   %rax, %rax
    jne L8
    leaq    LC0(%rip), %rdi
    jmp L2
    .align 4
L9:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L2:
    movl    $68, %esi
    call    _strchr
    testq   %rax, %rax
    jne L9
    leaq    LC1(%rip), %rdi
    movl    %ebx, %esi
    xorl    %eax, %eax
    call    _printf
    xorl    %eax, %eax
    popq    %rbx
LCFI1:
    ret
LFE1:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE2:
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$0,LECIE1-LSCIE1
    .long L$set$0
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x1
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$1,LEFDE1-LASFDE1
    .long L$set$1
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1-.
    .set L$set$2,LFE1-LFB1
    .quad L$set$2
    .byte   0
    .byte   0x4
    .set L$set$3,LCFI0-LFB1
    .long L$set$3
    .byte   0xe
    .byte   0x10
    .byte   0x83
    .byte   0x2
    .byte   0x4
    .set L$set$4,LCFI1-LCFI0
    .long L$set$4
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE1:
    .subsections_via_symbols

我建议你至少将代码的那一部分转换为C.如果你有一个周围的C ++代码库,嵌入它不应该是一个太大的问题。

无论哪种方式,您都可以采用上述示例之一,将str更改为从文件中读取,将patterns数组更改为所需数量的模式(确保设置{{1}如果这仍然不足以提高性能,那么实现循环就可以跨多个线程进行分割(虽然要覆盖C和C ++,但这个主题太宽泛了。)这个答案)。

您当然也可以将命令行参数解析为模式数组:

NUM_PATTERNS

如果您在此处将C示例编译为C ++,请确保将所有包含指令从#include <stdlib.h> #include <string.h> int numPatterns(char *str) { int num = 1; for(int i = 0, len = strlen(str); i < len; ++i) // loop through all characters of the string { if(str[i] == '|') // increase the counter each time we hit a | { ++num; } } return num; } void splitPatterns(char *str, int num, char** dst) { char *raw = (char*)(dst + num * sizeof(char*)); // this is the region after the pointers strcpy(raw, str); // first copy the string (note, this function is dangerous if used on anything other than strings! (google buffer overflow)) dst[0] = raw; for(int i = 0, len = strlen(str), n = 1; i < len; ++i) { if(raw[i] == '|') { raw[i] = '\0'; // replace all | by '\0' in order to split dst[n++] = &raw[i + 1]; // and let the next string start at the next character } } } // let's say your program is called as ./program 'input_file' 'list|of|patterns' int main(int argc, char **argv) { /* check that argc >= 3 and blah */ int num = numPatterns(argv[2]); // In order to malloc() and free() only once, I'm stashing things together here. // First comes an array of num pointers, hence num * sizeof(char*), followed by // the strings which take up the same space as argv[2] (the | just gets // replaced by \0), which equals strlen(argv[2]) + 1 for the terminating null byte. char **patterns = (char**)malloc(num * sizeof(char*) + strlen(argv[2]) + 1); splitPatterns(argv[2], num, patterns); /* work with patterns here */ free(patterns); } 更改为<*.h>,即<c*>更改为<stdio.h>等。