Spirit X3:具有内部状态的解析器

时间:2018-06-12 16:31:37

标签: c++ c++17 boost-spirit boost-spirit-x3

我想高效地解析大型的类CSV文件,这些文件的列顺序只有在运行时才能得知。如果使用Spirit Qi,我会用lazy辅助解析器来解析每个字段,由它在运行时为每一列选择对应的列专用解析器。但是X3似乎没有lazy(尽管文档中列出了它)。在阅读了Stack Overflow上的建议之后,我决定编写一个自定义解析器。

结果效果很好,但我现在发现,其实并不需要把pos变量暴露在自定义解析器之外的任何地方。我尝试把它放进自定义解析器内部,却开始收到编译器错误,提示column_value_parser对象是只读的。我能以某种方式把pos放进解析器结构体里吗?

下面是会产生编译时错误的简化代码,其中被注释掉的部分是我能正常工作的版本:

#include <iostream>
#include <variant>

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/support.hpp>

namespace helpers {
    // Overload-set builder for std::visit: inherits the call operator of every
    // callable it is given. See https://bitbashing.io/std-visit.html
    template<class... Fs>
    struct overloaded : Fs... {
        using Fs::operator()...;
    };

    // Deduction guide so that `overloaded{f, g, ...}` needs no template arguments.
    template<class... Fs>
    overloaded(Fs...) -> overloaded<Fs...>;
}

// Unquoted field rule: zero or more characters, stopping before a ',' or an end-of-line.
auto const unquoted_text_field = *(boost::spirit::x3::char_ - ',' - boost::spirit::x3::eol);

// Empty tag types, one per kind of column.
struct text {};
struct integer {};
struct real {};
struct skip {};

// Holds exactly one column tag; a default-constructed value holds `text`.
using column_variant = std::variant<text, integer, real, skip>;

// Custom X3 parser that parses a single field using the rule selected by
// columns[pos] and, on success, advances pos cyclically through columns.
//
// NOTE: this is the variant from the question that does NOT compile: parse()
// is const-qualified, yet it assigns to the non-mutable member `pos`. The
// commented-out lines show the working variant, where `pos` is a reference to
// a counter owned by the caller.
struct column_value_parser : boost::spirit::x3::parser<column_value_parser> {
    // The parser's attribute type is unused_type; values are only printed.
    typedef boost::spirit::unused_type attribute_type;

    // Column kinds, in order; referenced, not owned.
    std::vector<column_variant>& columns;
    // size_t& pos;
    // Index into `columns` of the next field to parse.
    size_t pos;

    // column_value_parser(std::vector<column_variant>& columns, size_t& pos)
    column_value_parser(std::vector<column_variant>& columns)
        : columns(columns)
    //    , pos(pos)
        , pos(0)
    { }

    // Parses one field from [f, l) according to columns[pos].
    // Returns true and leaves f after the field on success; on failure
    // restores f to where it started and returns false.
    template<typename It, typename Ctx, typename Other, typename Attr>
    bool parse(It& f, It l, Ctx& ctx, Other const& other, Attr& attr) const {
        // Remember the start so the iterator can be restored on failure.
        auto const saved_f = f;
        bool successful = false;

        // Dispatch on the tag held by columns[pos].
        visit(
            helpers::overloaded {
                [&](skip const&) {
                    // Consume the field but discard its value.
                    successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::omit[unquoted_text_field]);
                },
                [&](text& c) {
                    std::string value;
                    successful = boost::spirit::x3::parse(f, l, unquoted_text_field, value);
                    if(successful) {
                        std::cout << "Text: " << value << '\n';
                    }
                },
                [&](integer& c) {
                    int value;
                    successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::int_, value);
                    if(successful) {
                        std::cout << "Integer: " << value << '\n';
                    }
                },
                [&](real& c) {
                    double value;
                    successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::double_, value);
                    if(successful) {
                        std::cout << "Real: " << value << '\n';
                    }
                }
            },
            columns[pos]);

        if(successful) {
            // This assignment is what the compiler rejects inside a const member function.
            pos = (pos + 1) % columns.size();
            return true;
        } else {
            f = saved_f;
            return false;
        }
    }
};


// Demo driver: parses a two-line, four-column sample with the custom parser.
int main(int argc, char *argv[])
{
    std::string input = "Hello,1,13.7,XXX\nWorld,2,1e3,YYY";

    // Comes from external source.
    std::vector<column_variant> columns = {text{}, integer{}, real{}, skip{}};
    // Only used by the commented-out (working) variant below.
    size_t pos = 0;

    // Fields are separated by ',' and rows by end-of-line; the result of
    // parse() is not checked.
    boost::spirit::x3::parse(
        input.begin(), input.end(),
//         (column_value_parser(columns, pos) % ',') % boost::spirit::x3::eol);
        (column_value_parser(columns) % ',') % boost::spirit::x3::eol);
}

XY问题背景:我的目标是在一台内存很小的机器上,在合理的时间内解析约500 GB的伪CSV文件,将其转换为(大致)[行号,列名,值]形式的列表,然后写入存储。该格式实际上比CSV更复杂:数据库转储被格式化成了……便于人阅读的样子,列值实际上是几种小型子语言(例如日期,或者,呃,类似于把一整行apache日志塞进单个字段的东西),而我通常只需要提取每列中的某个特定部分。不同的文件可能有不同的列和不同的列顺序,而这些信息我只能通过解析另一组包含原始查询的文件来得知。值得庆幸的是,Spirit让这一切变得轻而易举……

1 个答案:

答案 0 :(得分:3)

三种解法:

  1. 最简单的解决方法是让pos成为mutable成员
  2. X3的核心做法是使用x3::with<>
  3. 函数式组合

    1. 使pos可变(mutable)

    **Live On Wandbox**

    #include <iostream>
    #include <variant>
    
    #include <boost/spirit/home/x3.hpp>
    #include <boost/spirit/home/support.hpp>
    
    namespace helpers {
        // Overload-set builder for std::visit: inherits the call operator of every
        // callable it is given. See https://bitbashing.io/std-visit.html
        template<class... Fs>
        struct overloaded : Fs... {
            using Fs::operator()...;
        };

        // Deduction guide so that `overloaded{f, g, ...}` needs no template arguments.
        template<class... Fs>
        overloaded(Fs...) -> overloaded<Fs...>;
    }
    
    // Unquoted field rule: zero or more characters, stopping before a ',' or an end-of-line.
    auto const unquoted_text_field = *(boost::spirit::x3::char_ - ',' - boost::spirit::x3::eol);
    
    // Empty tag types, one per kind of column.
    struct text {};
    struct integer {};
    struct real {};
    struct skip {};

    // Holds exactly one column tag; a default-constructed value holds `text`.
    using column_variant = std::variant<text, integer, real, skip>;
    
    /// Stateful X3 parser: each call parses one field using the rule selected by
    /// `columns[pos]`, then advances `pos` cyclically through `columns`.
    ///
    /// The position lives in the parser object itself, so every copy of the
    /// parser carries its own counter.
    struct column_value_parser : boost::spirit::x3::parser<column_value_parser> {
        using attribute_type = boost::spirit::unused_type;

        /// Column kinds, in order; referenced, not owned.
        std::vector<column_variant>& columns;
        /// Index of the next column; `mutable` so the const-qualified parse()
        /// below is allowed to advance it.
        size_t mutable pos = 0;

        column_value_parser(std::vector<column_variant>& columns)
            : columns(columns)
        { }

        /// Parses one field from [f, l) according to `columns[pos]`.
        ///
        /// @return true with `f` advanced past the field and `pos` moved to the
        ///         next column; false with `f` and `pos` unchanged if the field
        ///         does not match or `columns` is empty.
        template<typename It, typename Ctx, typename Other, typename Attr>
        bool parse(It& f, It l, Ctx& /*ctx*/, Other const& /*other*/, Attr& /*attr*/) const {
            // With no columns there is nothing to dispatch on; indexing
            // columns[pos] and computing `% columns.size()` would be undefined.
            if(columns.empty()) {
                return false;
            }

            auto const saved_f = f;
            bool successful = false;

            std::visit(
                helpers::overloaded {
                    [&](skip const&) {
                        // Consume the field but discard its value.
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::omit[unquoted_text_field]);
                    },
                    [&](text&) {
                        std::string value;
                        successful = boost::spirit::x3::parse(f, l, unquoted_text_field, value);
                        if(successful) {
                            std::cout << "Text: " << value << '\n';
                        }
                    },
                    [&](integer&) {
                        int value{};
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::int_, value);
                        if(successful) {
                            std::cout << "Integer: " << value << '\n';
                        }
                    },
                    [&](real&) {
                        double value{};
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::double_, value);
                        if(successful) {
                            std::cout << "Real: " << value << '\n';
                        }
                    }
                },
                columns[pos]);

            if(!successful) {
                // Leave the input untouched so an enclosing parser can backtrack.
                f = saved_f;
                return false;
            }

            pos = (pos + 1) % columns.size();
            return true;
        }
    };
    
    
    int main() {
        std::string input = "Hello,1,13.7,XXX\nWorld,2,1e3,YYY";
    
        std::vector<column_variant> columns = {text{}, integer{}, real{}, skip{}};
    
        boost::spirit::x3::parse(
            input.begin(), input.end(),
            (column_value_parser(columns) % ',') % boost::spirit::x3::eol);
    }
    

    2. x3::with<>

    这种做法与上面类似,但具有更好的可重入性和封装性:

    **Live On Wandbox**

    #include <iostream>
    #include <variant>
    
    #include <boost/spirit/home/x3.hpp>
    #include <boost/spirit/home/support.hpp>
    
    namespace helpers {
        // Overload-set builder for std::visit: inherits the call operator of every
        // callable it is given. See https://bitbashing.io/std-visit.html
        template<class... Fs>
        struct overloaded : Fs... {
            using Fs::operator()...;
        };

        // Deduction guide so that `overloaded{f, g, ...}` needs no template arguments.
        template<class... Fs>
        overloaded(Fs...) -> overloaded<Fs...>;
    }
    
    // Unquoted field rule: zero or more characters, stopping before a ',' or an end-of-line.
    auto const unquoted_text_field = *(boost::spirit::x3::char_ - ',' - boost::spirit::x3::eol);
    
    // Empty tag types, one per kind of column.
    struct text {};
    struct integer {};
    struct real {};
    struct skip {};

    // Holds exactly one column tag; a default-constructed value holds `text`.
    using column_variant = std::variant<text, integer, real, skip>;
    
    /// X3 parser that parses one field per call using the rule selected by
    /// `columns[pos]`, where `pos` is read from the X3 context under `pos_tag`
    /// rather than stored in the parser object.
    ///
    /// Use the parser through invoke(), which supplies that context entry;
    /// parse() reads `pos_tag` from whatever context it is given.
    struct column_value_parser : boost::spirit::x3::parser<column_value_parser> {
        using attribute_type = boost::spirit::unused_type;

        /// Wrapper whose payload stays writable even when the wrapper is const.
        template <typename T>
        struct Mutable { T mutable value; };
        /// Context tag under which invoke() stores the column position.
        struct pos_tag;

        /// Column kinds, in order; referenced, not owned.
        std::vector<column_variant>& columns;

        column_value_parser(std::vector<column_variant>& columns)
            : columns(columns)
        { }

        /// Parses one field from [f, l) according to `columns[pos]`.
        ///
        /// @return true with `f` advanced past the field and the context's
        ///         position moved to the next column; false with both unchanged
        ///         if the field does not match or `columns` is empty.
        template<typename It, typename Ctx, typename Other, typename Attr>
        bool parse(It& f, It l, Ctx const& ctx, Other const& /*other*/, Attr& /*attr*/) const {
            // With no columns there is nothing to dispatch on; indexing
            // columns[pos] and computing `% columns.size()` would be undefined.
            if(columns.empty()) {
                return false;
            }

            auto const saved_f = f;
            bool successful = false;

            // Position counter stored in the context by invoke().
            size_t& pos = boost::spirit::x3::get<pos_tag>(ctx).value;

            std::visit(
                helpers::overloaded {
                    [&](skip const&) {
                        // Consume the field but discard its value.
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::omit[unquoted_text_field]);
                    },
                    [&](text&) {
                        std::string value;
                        successful = boost::spirit::x3::parse(f, l, unquoted_text_field, value);
                        if(successful) {
                            std::cout << "Text: " << value << '\n';
                        }
                    },
                    [&](integer&) {
                        int value{};
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::int_, value);
                        if(successful) {
                            std::cout << "Integer: " << value << '\n';
                        }
                    },
                    [&](real&) {
                        double value{};
                        successful = boost::spirit::x3::parse(f, l, boost::spirit::x3::double_, value);
                        if(successful) {
                            std::cout << "Real: " << value << '\n';
                        }
                    }
                },
                columns[pos]);

            if(!successful) {
                // Leave the input untouched so an enclosing parser can backtrack.
                f = saved_f;
                return false;
            }

            pos = (pos + 1) % columns.size();
            return true;
        }

        /// Wraps a copy of this parser in an x3::with<pos_tag> directive holding
        /// a zero-initialised position counter.
        auto invoke() const {
            return boost::spirit::x3::with<pos_tag>(Mutable<size_t>{}) [ *this ];
        }
    };
    
    
    int main() {
        std::string input = "Hello,1,13.7,XXX\nWorld,2,1e3,YYY";
    
        std::vector<column_variant> columns = {text{}, integer{}, real{}, skip{}};
        column_value_parser p(columns);
    
        boost::spirit::x3::parse(
            input.begin(), input.end(),
            (p.invoke() % ',') % boost::spirit::x3::eol);
    }
    

    3. 函数式组合

    因为这在X3中要容易得多,所以我最喜欢的做法是按需生成解析器。

    在没有更多具体需求的情况下,这是我能给出的最简单的建议:

    **Live On Wandbox**

    #include <boost/spirit/home/x3.hpp>
    namespace x3 = boost::spirit::x3;
    
    namespace CSV {
        // Empty tag types, one per kind of column.
        struct text    { };
        struct integer { };
        struct real    { };
        struct skip    { };

        // Raw field: zero or more characters other than ',' and '\n'.
        auto const unquoted_text_field = *~x3::char_(",\n");

        // Maps a column tag to the X3 parser used for that column's value.
        static inline auto as_parser(skip) {
            return x3::omit[unquoted_text_field];
        }
        static inline auto as_parser(text) {
            return unquoted_text_field;
        }
        static inline auto as_parser(integer) {
            return x3::int_;
        }
        static inline auto as_parser(real) {
            return x3::double_;
        }

        // Builds the parser for one row: each column's parser, in the order
        // given, followed by a field terminator.
        template <typename... Spec>
        static inline auto line_parser(Spec... spec) {
            // A field ends at a ',' (consumed) or just before end-of-input /
            // end-of-line (checked by the and-predicate, not consumed).
            auto const field_end = ',' | &(x3::eoi | x3::eol);
            return ((as_parser(spec) >> field_end) >> ... >> x3::eps);
        }

        // Builds the parser for the whole input: rows separated by end-of-line.
        template <typename... Spec>
        static inline auto csv_parser(Spec... spec) {
            return line_parser(spec...) % x3::eol;
        }
    }
    
    #include <iostream>
    #include <iomanip>
    using namespace CSV;
    
    int main() {
        std::string const input = "Hello,1,13.7,XXX\nWorld,2,1e3,YYY";
        auto f = begin(input), l = end(input);
    
        auto p = csv_parser(text{}, integer{}, real{}, skip{});
    
        if (parse(f, l, p)) {
            std::cout << "Parsed\n";
        } else {
            std::cout << "Failed\n";
        }
    
        if (f!=l) {
            std::cout << "Remaining: " << std::quoted(std::string(f,l)) << "\n";
        }
    }
    

    启用了调试信息的版本:

    **Live On Wandbox**

    <line>
      <try>Hello,1,13.7,XXX\nWor</try>
      <CSV::text>
        <try>Hello,1,13.7,XXX\nWor</try>
        <success>,1,13.7,XXX\nWorld,2,</success>
      </CSV::text>
      <CSV::integer>
        <try>1,13.7,XXX\nWorld,2,1</try>
        <success>,13.7,XXX\nWorld,2,1e</success>
      </CSV::integer>
      <CSV::real>
        <try>13.7,XXX\nWorld,2,1e3</try>
        <success>,XXX\nWorld,2,1e3,YYY</success>
      </CSV::real>
      <CSV::skip>
        <try>XXX\nWorld,2,1e3,YYY</try>
        <success>\nWorld,2,1e3,YYY</success>
      </CSV::skip>
      <success>\nWorld,2,1e3,YYY</success>
    </line>
    <line>
      <try>World,2,1e3,YYY</try>
      <CSV::text>
        <try>World,2,1e3,YYY</try>
        <success>,2,1e3,YYY</success>
      </CSV::text>
      <CSV::integer>
        <try>2,1e3,YYY</try>
        <success>,1e3,YYY</success>
      </CSV::integer>
      <CSV::real>
        <try>1e3,YYY</try>
        <success>,YYY</success>
      </CSV::real>
      <CSV::skip>
        <try>YYY</try>
        <success></success>
      </CSV::skip>
      <success></success>
    </line>
    Parsed
    

    注释,注意事项:

    • 对于任何mutable状态,请注意副作用。例如,如果您有a | b,且a中包含column_value_parser,那么当a失败而改为匹配b时,递增pos这一副作用不会被回滚。

      简而言之,这会使您的解析函数不再是纯函数。