Boost regex_search 引发的 regex error_stack 异常

时间:2018-10-27 07:51:18

标签: c++ regex boost

当我尝试使用boost 1.68正则表达式搜索带有模式的内容时,我写道:

#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <boost/regex.hpp>

// Searches the file "flask" for function-call-like text with a recursive
// Boost.Regex (Perl syntax) pattern and prints every match, numbered from 0.
//
// argc/argv are accepted but not used.
// Returns 0 on success, 1 when the input file cannot be opened or when the
// regex engine throws while searching.
int main(int argc, char** argv)
{
    // The pattern is a single string literal continued across lines with
    // backslash-newline; each (?(DEFINE)...) group declares a named
    // sub-pattern that the final (?P>FUNCTION_CALL) invokes recursively.
    const std::string pattern("\
(?(DEFINE)(?'NAMESPACE'\\w*::))(?#r)\
(?(DEFINE)(?'CONSTANT'(\"(?:[^\"\\\\]|\\\\.)*\")|(\\d+\\.?\\d*f?)))(?#r)\
(?(DEFINE)(?'VARIABLE'(?P>NAMESPACE)*([A-Za-z_]\\w*\\.)*[A-Za-z_]\\w*))(?#r)\
(?(DEFINE)(?'OPERAND'(\\+|-)*((?P>VARIABLE)|(?P>CONSTANT))))(?#r)\
(?(DEFINE)(?'EXPRESSION'\\s*(?P>OPERAND)\\s*(\\s*[\\*\\+-\\/]\\s*(?P>OPERAND))*))(?#r)\
(?(DEFINE)(?'ARGUMENTS'(?P>EXPRESSION)(,\\s*(?P>EXPRESSION))*))(?#r)\
(?(DEFINE)(?'FUNCTION_CALL'(?P>VARIABLE)\\(\\s*(?P>ARGUMENTS)?\\s*\\)))(?#r)\
(?P>FUNCTION_CALL)");
    std::cout << "pattern: " << pattern << std::endl;
    boost::regex simple_function(pattern, boost::regex_constants::perl);

    std::ifstream file("flask");
    if (!file.is_open()) {
        // Report the missing input instead of silently doing nothing.
        std::cerr << "cannot open input file 'flask'" << std::endl;
        return 1;
    }

    // Slurp the whole file so the search can run over one contiguous buffer.
    std::string context((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    boost::smatch results;
    boost::match_flag_type flags = boost::match_default | boost::match_single_line;
    auto start = context.cbegin();
    auto end = context.cend();
    int line_n = 0;
    try {
        while (start < end && boost::regex_search(start, end, results, simple_function, flags)) {
            std::cout << '#' << line_n++ << ' ';
            std::cout << results[0] << std::endl;
            // Step past an empty match by one character so the loop always advances.
            start = (results[0].length() == 0) ? results[0].first + 1 : results[0].second;
        }
    }
    catch (const std::exception& e) {
        // Surface the library's own diagnostic rather than hiding it.
        std::cerr << "regex search failed: " << e.what() << std::endl;
        return 1;
    }
    catch (...) {
        std::cout << "exception throwed." << std::endl;
        return 1;
    }
    return 0;
}

运行时,它会抛出 boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<std::runtime_error>> 异常(内存位置 0x00000073378FE638)。我看到的错误是 error_stack。

但是我不知道为什么:我在 regex101 和 regextester 上用相同的模式和 context 测试了我的表达式,只有我的程序会失败并抛出异常。是我做错了什么,还是误解了 boost regex 的用法?有什么办法可以避免 error_stack?

1 个答案:

答案 0 :(得分:1)

有趣。为此,我不得不学习正则表达式的全新领域。为此表示敬意。

问题在于递归表达式(Recursive Expressions)。您需要非常确定模式不会过于随意地递归,否则就会陷入无限递归;或者像这里一样,“只是”在输入很长时递归得过深。

所以,首先我整理一下:

// The recursive pattern, written as adjacent raw string literals that the
// compiler concatenates into one string. Each (?(DEFINE)...) group declares a
// named sub-pattern; only the final (?P>FUNCTION_CALL) actually consumes input.
const std::string pattern(
        // NAMESPACE: word characters followed by "::"
        R"((?(DEFINE)(?'NAMESPACE'\w*::)))"
        // CONSTANT: a double-quoted string with backslash escapes, or a number with an optional trailing 'f'
        R"((?(DEFINE)(?'CONSTANT'("(?:[^"\\]|\\.)*")|(\d+\.?\d*f?))))"
        // VARIABLE: optional namespace qualifiers, then dot-separated identifiers
        R"((?(DEFINE)(?'VARIABLE'(?P>NAMESPACE)*([A-Za-z_]\w*\.)*[A-Za-z_]\w*)))"
        // OPERAND: any number of leading '+'/'-' signs, then a VARIABLE or a CONSTANT
        R"((?(DEFINE)(?'OPERAND'(\+|-)*((?P>VARIABLE)|(?P>CONSTANT)))))"
        // EXPRESSION: operands joined by an operator character
        // NOTE(review): inside the class, `\+-\/` reads as a range '+'..'/',
        // which would also admit ',' and '.' as operators; confirm intent.
        R"((?(DEFINE)(?'EXPRESSION'\s*(?P>OPERAND)\s*(\s*[\*\+-\/]\s*(?P>OPERAND))*)))"
        // ARGUMENTS: comma-separated EXPRESSIONs
        R"((?(DEFINE)(?'ARGUMENTS'(?P>EXPRESSION)(,\s*(?P>EXPRESSION))*)))"
        // FUNCTION_CALL: VARIABLE '(' optional ARGUMENTS ')'
        R"((?(DEFINE)(?'FUNCTION_CALL'(?P>VARIABLE)\(\s*(?P>ARGUMENTS)?\s*\))))"
        // Entry point: match one FUNCTION_CALL
        R"((?P>FUNCTION_CALL))");

现在,我开始“理解”模式,我决定不使用正则表达式进行语法¹,然后将其重写为Spirit X3:

// Spirit X3 grammar whose rule names mirror the named groups of the regex pattern.
namespace rules {
    using namespace x3;

    // One identifier character: alphanumeric or underscore.
    auto WORD            = (alnum | char_('_'));
    // One or more WORD characters followed by "::".
    // NOTE(review): not wrapped in lexeme, so under the skip(space) directive
    // below the skipper presumably also runs between these characters; confirm.
    auto NAMESPACE       = +WORD >> "::";
    // A double-quoted string (no escape handling here) or a floating-point number.
    auto CONSTANT        = ( lexeme [ '"' >> *~char_('"') >> '"' ] | double_ );
    // Identifier: letter or underscore, then WORD characters; lexeme disables skipping inside it.
    auto ident           = lexeme [ char_("A-Za-z_") >> *WORD ];
    // Zero or more namespace qualifiers, then identifiers separated by '.'.
    auto VARIABLE        = *NAMESPACE >> ident % '.';
    // Any number of leading '+'/'-' signs, then a variable or a constant.
    auto OPERAND         = *(char_("+-")) >> (VARIABLE | CONSTANT);
    // Operands separated by one of the characters * + / -.
    auto EXPRESSION      = OPERAND % char_("*+/-");
    // Comma-separated expressions.
    auto ARGUMENTS       = EXPRESSION % ',';
    // VARIABLE '(' optional ARGUMENTS ')'.
    auto FUNCTION_CALL   = VARIABLE >> '(' >> -ARGUMENTS >> ')';

    // Top-level rule: skips whitespace between tokens and, via raw[], exposes
    // the matched source text as a std::string.
    auto simple_function = rule<struct simple_function_, std::string> {"simple_function"}
                         = skip(space) [ x3::raw[FUNCTION_CALL] ];
}

现在,由于只在更合适的位置接受空白(参见 skip 与 lexeme²),这种写法更加准确。而且,它显然不会遭遇严重的回溯问题:

Live On Wandbox

#include <iostream>
#include <fstream>
#include <sstream>
#include <iterator>
#include <string>
#include <boost/regex.hpp>
#include <boost/spirit/home/x3.hpp>
namespace x3 = boost::spirit::x3;

// Spirit X3 grammar whose rule names mirror the named groups of the regex pattern.
namespace rules {
    using namespace x3;

    // One identifier character: alphanumeric or underscore.
    auto WORD            = (alnum | char_('_'));
    // One or more WORD characters followed by "::".
    // NOTE(review): not wrapped in lexeme, so under the skip(space) directive
    // below the skipper presumably also runs between these characters; confirm.
    auto NAMESPACE       = +WORD >> "::";
    // A double-quoted string (no escape handling here) or a floating-point number.
    auto CONSTANT        = ( lexeme [ '"' >> *~char_('"') >> '"' ] | double_ );
    // Identifier: letter or underscore, then WORD characters; lexeme disables skipping inside it.
    auto ident           = lexeme [ char_("A-Za-z_") >> *WORD ];
    // Zero or more namespace qualifiers, then identifiers separated by '.'.
    auto VARIABLE        = *NAMESPACE >> ident % '.';
    // Any number of leading '+'/'-' signs, then a variable or a constant.
    auto OPERAND         = *(char_("+-")) >> (VARIABLE | CONSTANT);
    // Operands separated by one of the characters * + / -.
    auto EXPRESSION      = OPERAND % char_("*+/-");
    // Comma-separated expressions.
    auto ARGUMENTS       = EXPRESSION % ',';
    // VARIABLE '(' optional ARGUMENTS ')'.
    auto FUNCTION_CALL   = VARIABLE >> '(' >> -ARGUMENTS >> ')';

    // Top-level rule: skips whitespace between tokens and, via raw[], exposes
    // the matched source text as a std::string.
    auto simple_function = rule<struct simple_function_, std::string> {"simple_function"}
                         = skip(space) [ x3::raw[FUNCTION_CALL] ];
}

// Reads the whole file "flask" into memory, collects the source text of every
// function call the grammar recognises, and prints one call per line.
//
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
    std::ifstream file("flask");
    if (!file) {
        // Without this check a missing file would look like empty input.
        std::cerr << "cannot open input file 'flask'\n";
        return 1;
    }
    std::string const context(std::istreambuf_iterator<char>(file), {});

    std::vector<std::string> calls;
    // seek[] advances to the next position where the rule matches; the
    // enclosing Kleene star repeats that and appends each match to `calls`.
    parse(context.begin(), context.end(), *x3::seek[rules::simple_function], calls);

    for (auto const& call : calls) {
        std::cout << call << "\n";
    }
    return 0;
}

输出如下:

anno::copyright_notice("XXXXX")
anno::author("Someone")
anno::contributor("")
state::texture_coordinate(0)
state::texture_tangent_u(0)
state::texture_tangent_v(0)

¹我知道Perl6很棒,但是仍然

²Boost spirit skipper issues

更新/奖励

为了展示 Spirit X3 除了匹配文本之外还能做什么,下面对之前的快速移植版本稍作改进,演示如何使用同样的规则解析出强类型的 AST 数据类型。

所做的更改:

  • 修复了在命名空间限定符中没有lexeme标识符的错误
  • 同时使标识符解析与名称空间一致(很可能名称空间名称也不能以数字字符开头)
  • 解析为强类型数据类型 AST::Variable、AST::Literal(用于字符串或数字文字)和 AST::FunctionCall
  • 支持字符串文字内的转义。这意味着 "A\"B" 现在将被正确解析为包含 A"B 的 AST::Literal。
  • 如果您检查调试输出(#define BOOST_SPIRIT_X3_DEBUG),您实际上可以看到这些文字被解析出来

Live On Wandbox

#define BOOST_SPIRIT_X3_DEBUG

同时打印“源”解析和“ AST”解析:

//#define BOOST_SPIRIT_X3_DEBUG
#include <iostream>
#include <fstream>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
namespace x3 = boost::spirit::x3;

// Strongly typed results produced by the grammar.
namespace AST {
    // A possibly namespace-qualified name such as a::b::obj.member.
    struct Variable {
        // `namespaces` holds the qualifiers before each "::";
        // `nested_objects` holds the identifiers separated by '.'.
        std::vector<std::string> namespaces, nested_objects;

        // Streams every namespace as "[ns]::" followed by the nested objects
        // as "[obj]" joined with '.'.
        friend std::ostream& operator<<(std::ostream& os, Variable const& v) {
            // Iterate by const reference so no std::string is copied per element.
            for (auto const& ns : v.namespaces)
                os << '[' << ns << "]::";
            bool first = true;
            for (auto const& obj : v.nested_objects) {
                os << (first?"":".") << '[' << obj << ']';
                first = false;
            }
            return os;
        }
    };

    // A literal operand: either a string or a number.
    using Literal = boost::variant<std::string, double>;

    // A call: the callee name plus the source text of each argument.
    struct FunctionCall {
        Variable name;
        std::vector<std::string> arguments;
    };
}

BOOST_FUSION_ADAPT_STRUCT(AST::Variable, namespaces, nested_objects)
BOOST_FUSION_ADAPT_STRUCT(AST::FunctionCall, name, arguments)

// Spirit X3 grammar that parses function calls into the AST types.
namespace rules {
    using namespace x3;

    // Identifier: a letter or '_' followed by alphanumerics or '_'. lexeme
    // disables skipping inside it; raw exposes the matched text as a std::string.
    auto ident           = rule<struct ident_, std::string> {"ident"}
                         = lexeme [ raw [ (alpha|'_') >> *(alnum|'_') ] ];
    // Namespace qualifier: an identifier followed by "::".
    auto namespace_      = rule<struct namespace_, std::string> {"namespace_"}
                         = ident >> "::";
    // Double-quoted string. A backslash followed by any character contributes
    // that character (escape); otherwise any character except '"' is taken.
    auto quoted_str      = rule<struct quoted_str_, std::string> {"quoted_str"}
                         = lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
    // Literal: a quoted string or a floating-point number.
    auto constant        = rule<struct constant_, AST::Literal> {"constant"}
                         = quoted_str | double_;
    // Zero or more namespace qualifiers, then identifiers separated by '.'.
    auto variable        = rule<struct variable_, AST::Variable> {"variable"}
                         = *namespace_ >> ident % '.';
    // Any number of leading '+'/'-' signs, then a variable or a constant.
    // The rule declares no attribute type.
    auto operand         = rule<struct operand_> {"operand"}
                         = *char_("+-") >> (variable | constant);
    // Operands separated by one of * + / -; raw exposes the matched source
    // text as a std::string.
    auto expression      = rule<struct expression_, std::string> {"expression"}
                         = raw [ operand % char_("*+/-") ];
    // Comma-separated expressions.
    auto arguments       = expression % ',';
    // variable '(' optional arguments ')', parsed into AST::FunctionCall.
    auto function_call   = rule<struct function_call_, AST::FunctionCall> {"function_call"}
                         = variable >> '(' >> -arguments >> ')';

    // Top-level parser: function_call with whitespace skipped between tokens.
    auto simple_function = skip(space) [ function_call ];
}

// Parses the file "flask" twice, streaming straight from the file: first to
// print the source text of every function call, then to print each call as a
// parsed AST::FunctionCall.
//
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
    // parsing the raw sources out as string
    {
        std::ifstream file("flask");
        if (!file) {
            std::cerr << "cannot open input file 'flask'\n";
            return 1;
        }
        // istream_iterator reads through formatted extraction, so the
        // stream's default skipws flag would drop all whitespace before the
        // parser sees it. Turn it off and let the grammar's skipper decide.
        file.unsetf(std::ios::skipws);
        boost::spirit::istream_iterator f(file), l;

        std::vector<std::string> src;
        parse(f, l, *x3::seek[x3::raw[rules::simple_function]], src);

        for (auto const& call : src)
            std::cout << call << "\n";
    }

    // parsing AST::FunctionCall objects
    {
        std::ifstream file("flask");
        if (!file) {
            std::cerr << "cannot open input file 'flask'\n";
            return 1;
        }
        // Same reason as above: keep whitespace in the character stream.
        file.unsetf(std::ios::skipws);
        boost::spirit::istream_iterator f(file), l;

        std::vector<AST::FunctionCall> parsed;
        parse(f, l, *x3::seek[rules::simple_function], parsed);

        for (auto const& call : parsed) {
            std::cout << call.name << "\n";
            for (auto const& argument : call.arguments)
                std::cout << " - argument: " << argument << "\n";
        }
    }
    return 0;
}