在 boost::spirit 中使用 traits(特征)解析日期

时间:2018-04-05 11:49:57

标签: boost boost-spirit

我一直在解析包含日期的日志文件并将它们存储为字符串,但由于字符串分配,这会耗费大量内存并且成本很高。

有人建议我用时间戳(Timestamp)来存储日期,并用 Boost.Spirit 的 stream 解析器来解析它,所以我分别尝试了“boost::posix_time::ptime”和老式的“std::time_t + std::get_time”,但两者都严重拖累了性能。

我想尝试一种新方法:先把日期的各个部分解析为普通的 int,然后通过 traits 把它们转换为 std::time_t。我尽可能地参照了这个 Spirit 示例 https://www.boost.org/doc/libs/1_60_0/libs/spirit/example/qi/parse_date.cpp,但我甚至没能把自己的测试写出来。Boost.Spirit 晦涩的 transform_attribute 语法也让事情更加困难。

有人可以帮我解决这个问题吗?

#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <ctime>

// Iterator type accepted by parse_string and used to instantiate the grammar.
using It = std::string::const_iterator;

namespace structs {
    // The seven integers yielded by the date rule, in the order they appear in
    // the text: year, month, day, hour, minute, second, fractional digits.
    using date_parts = boost::fusion::vector<int, int, int, int, int, int, int>;

    // Calendar time plus the value taken from the seventh date part.
    struct Timestamp {
        std::time_t date;
        int ms;
    };

    // One parsed log line: timestamp, duration value and trailing text.
    struct Record {
        Timestamp date;
        double time;
        std::string str;
    };

    // Every record collected from one input.
    using Records = std::vector<Record>;
}

// Adapts structs::Record as a Fusion sequence (members listed in declaration
// order) so that a rule with a structs::Record attribute can fill it.
// NOTE(review): `Timestamp` is written unqualified here although it is declared
// inside namespace structs — confirm this resolves at global scope.
BOOST_FUSION_ADAPT_STRUCT(structs::Record,
        (Timestamp, date)
        (double, time)
        (std::string, str))

// Qi customization point: lets a structs::Timestamp attribute be parsed through
// a structs::date_parts intermediate and converted back afterwards.
namespace boost { namespace spirit { namespace traits
{
    template<>
    struct transform_attribute<structs::Timestamp, structs::date_parts, qi::domain>
    {
        // Intermediate attribute type handed to the parser.
        typedef structs::date_parts type;

        // Supplies a default-constructed date_parts for the parser to fill.
        static type pre(structs::Timestamp) { return type(); }
        // Nothing to undo when parsing fails.
        static void fail(structs::Timestamp&) { }
        // Converts the parsed parts into the Timestamp.
        static void post(structs::Timestamp& timestamp, type const& v)
        {
            // Positional initialisation: assumes std::tm begins with
            // sec, min, hour, mday, mon, year — TODO confirm for the target platform.
            // NOTE(review): element 1 (the month, per the `date` rule) has 1900
            // subtracted and element 0 (the year) is passed through unchanged;
            // tm_year is normally "years since 1900" — verify the two are not swapped.
            std::tm time = { fusion::at_c<5>(v), fusion::at_c<4>(v), fusion::at_c<3>(v),
                             fusion::at_c<2>(v), fusion::at_c<1>(v) - 1900, fusion::at_c<0>(v), 0, 0, 0 };

            timestamp.date = std::mktime(&time);
            // Seventh part (the six-digit field after the '.') is stored as-is.
            timestamp.ms = fusion::at_c<6>(v);
        }
    };
} } }

namespace qi = boost::spirit::qi;

namespace QiParsers {
    // Grammar that parses eol-separated log lines into structs::Records.
    template <typename It>
    struct Parser : qi::grammar<It, structs::Records()> {

        Parser() : Parser::base_type(start) {
            using namespace qi;

            // Month abbreviations mapped to 0-based month numbers.
            month.add
                ("Jan", 0)
                ("Feb", 1)
                ("Mar", 2)
                ("Apr", 3)
                ("May", 4)
                ("Jun", 5)
                ("Jul", 6)
                ("Aug", 7)
                ("Sep", 8)
                ("Oct", 9)
                ("Nov", 10)
                ("Dec", 11);

            // Layout: yyyy-Mon-dd hh:mm:ss.ffffff
            // NOTE(review): repeat(n)[digit] matches n digit characters; it is not a
            // numeric parser, so the int slots of date_parts presumably do not receive
            // the decimal value of the field — verify.
            date = repeat(4)[digit] >> '-' >> month >> '-' >> repeat(2)[digit] >> ' ' >> 
                   repeat(2)[digit] >> ':' >> repeat(2)[digit] >> ':' >> repeat(2)[digit] >> '.' >> repeat(6)[digit];

            // "[<date>] - <double> s => String: <text>"; the text is the raw run of
            // graphical characters. The Record's Timestamp member is filled from the
            // date_parts attribute of `date`, which presumably goes through the
            // transform_attribute specialization for Timestamp/date_parts.
            line = '[' >> date >> ']'
                >> " - " >> double_ >> " s"
                >> " => String: "  >> raw[+graph];

            // One or more lines separated by end-of-line.
            start = line % eol;
        }

      private:
        qi::symbols<char, int> month;

        qi::rule<It, structs::date_parts()> date;
        qi::rule<It, structs::Record()> line;
        qi::rule<It, structs::Records()> start;

    };
}

// Runs the grammar over [b, e) and returns the records it collected.
// The boolean result of parse() is not inspected.
structs::Records parse_string(It b, It e)
{
    // Constructed once, on the first call, and reused afterwards.
    static const QiParsers::Parser<It> grammar;

    structs::Records collected;
    parse(b, e, grammar, collected);
    return collected;
}

// Three sample log lines, each terminated by '\n'.
static const std::string input =
    "[2018-Mar-13 13:13:59.580482] - 0.200 s => String: Test_1\n"
    "[2018-Mar-14 13:13:59.580482] - 0.400 s => String: Test_2\n"
    "[2018-Mar-15 13:13:59.580482] - 0.600 s => String: Test_3\n";

int main() {
    const auto records = parse_string(input.begin(), input.end());

    return 0;
}

1 个答案:

答案 0 :(得分:1)

发布Why does using a stream in boost spirit penalize performance so much?后,我重新阅读了您的帖子并在此处添加了该方法。

关于特征和解析器规则的声明方式存在相当多的问题。

  • 值得注意的是,repeat(2)[digit]不会转换为整数属性。我怀疑你得到的可能是很多49、50之类的值(即'1'、'2'等字符的ASCII码),也许还有一些未定义的值

  • 您从月份值中扣除了1900

The Parser

简化为:

namespace QiParsers {

    struct Months : qi::symbols<char, int> {
        Months() { this->add
                ("Jan", 0)
                ("Feb", 1)
                ("Mar", 2)
                ("Apr", 3)
                ("May", 4)
                ("Jun", 5)
                ("Jul", 6)
                ("Aug", 7)
                ("Sep", 8)
                ("Oct", 9)
                ("Nov", 10)
                ("Dec", 11);
        }
    } static const mmm_;

    static const qi::uint_parser<int, 10, 4, 4> yyyy_;
    static const qi::uint_parser<int, 10, 2, 2> dd_, hh_, mm_, ss_;
    static const qi::uint_parser<int, 10, 6, 6> fff_;

}

现在解析器可以清晰地写成¹:

// Grammar for a single log record whose date is parsed field by field
// into integers and stored in the record's Timestamp member.
template <typename It>
struct Parser2 : qi::grammar<It, structs::Record2()>
{
    Parser2() : Parser2::base_type(start) {
        using namespace qi;

        // "[yyyy-Mon-dd hh:mm:ss.ffffff]" -> seven ints (year, month, day,
        // hour, minute, second, fraction digits).
        date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
            >> ' ' >> hh_   >> ':' >> mm_  >> ':' >> ss_ >> '.' >> fff_ >> ']';

        // Full record: date, " - <double> s", " => String: <graphical text>", end-of-line.
        start = 
            date //'[' >> raw[*~char_(']')] >> ']'
            >> " - " >> double_ >> " s"
            >> " => String: "  >> raw[+graph]
            >> eol;
    }

private:
    qi::rule<It, structs::Record2()> start;
    qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
};

特质

基本上沿用你已有的代码,但修正了其中的一些小问题:

// Lets a structs::Timestamp be parsed through any Fusion sequence `Attr` that
// holds, in order: year, month (0-11), day, hour, minute, second and the
// six-digit fractional field.
template <typename Attr>
struct transform_attribute<structs::Timestamp, Attr, qi::domain> {
    using type = Attr;

    // Supplies a default-constructed intermediate attribute for the parser to fill.
    static type pre(structs::Timestamp) { return type(); }
    // Nothing to undo when parsing fails.
    static void fail(structs::Timestamp&) { }
    // Converts the parsed fields into the Timestamp.
    static void post(structs::Timestamp& timestamp, type const& v) {
        // Value-initialise, then assign by member name. Neither the member order
        // of struct tm nor the presence of extras such as tm_gmtoff/tm_zone is
        // guaranteed, so a positional initialiser list is not portable.
        std::tm calendar{};
        calendar.tm_sec   = fusion::at_c<5>(v);
        calendar.tm_min   = fusion::at_c<4>(v);
        calendar.tm_hour  = fusion::at_c<3>(v);
        calendar.tm_mday  = fusion::at_c<2>(v);        // day of month, 1-31
        calendar.tm_mon   = fusion::at_c<1>(v);        // already 0-based
        calendar.tm_year  = fusion::at_c<0>(v) - 1900; // years since 1900
        calendar.tm_isdst = 0;                         // same DST flag as before

        timestamp.date = std::mktime(&calendar);
        // Six-digit field scaled to a fraction of a second (e.g. 012345 -> 0.012345).
        timestamp.ms = fusion::at_c<6>(v)/1000000.0;
    }
};

基准测试!

基准测试运行并正确解析:

Live On Coliru

#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_seek.hpp>
#include <boost/chrono/chrono.hpp>
#include <iomanip>
#include <ctime>

namespace structs {
    // Calendar time plus the sub-second part of the timestamp.
    struct Timestamp {
        std::time_t date;
        double ms;
    };

    // Record whose date is kept as the raw text found between the brackets.
    struct Record1 {
        std::string date;
        double time;
        std::string str;
    };

    // Record whose date is converted to a Timestamp while parsing.
    struct Record2 {
        Timestamp date;
        double time;
        std::string str;
    };

    using Records1 = std::vector<Record1>;
    using Records2 = std::vector<Record2>;
}

// Adapt both record types as Fusion sequences (members listed in declaration
// order) so rules with these attribute types can fill them member by member.
BOOST_FUSION_ADAPT_STRUCT(structs::Record1,
        (std::string, date)
        (double, time)
        (std::string, str))

BOOST_FUSION_ADAPT_STRUCT(structs::Record2,
        (structs::Timestamp, date)
        (double, time)
        (std::string, str))

namespace boost { namespace spirit { namespace traits {
    // Builds a std::string attribute directly from a matched iterator range.
    // Takes the address of *f, so it assumes the range refers to contiguous characters.
    template <typename It>
    struct assign_to_attribute_from_iterators<std::string, It, void> {
        static inline void call(It f, It l, std::string& attr) {
            attr = std::string(&*f, std::distance(f,l));
        }
    };

    // Lets a structs::Timestamp be parsed through any Fusion sequence `Attr` that
    // holds, in order: year, month (0-11), day, hour, minute, second and the
    // six-digit fractional field.
    template <typename Attr>
    struct transform_attribute<structs::Timestamp, Attr, qi::domain> {
        using type = Attr;

        // Supplies a default-constructed intermediate attribute for the parser to fill.
        static type pre(structs::Timestamp) { return type(); }
        // Nothing to undo when parsing fails.
        static void fail(structs::Timestamp&) { }
        // Converts the parsed fields into the Timestamp.
        static void post(structs::Timestamp& timestamp, type const& v) {
            // Value-initialise, then assign by member name. Neither the member
            // order of struct tm nor the presence of extras such as
            // tm_gmtoff/tm_zone is guaranteed, so a positional initialiser list
            // is not portable.
            std::tm calendar{};
            calendar.tm_sec   = fusion::at_c<5>(v);
            calendar.tm_min   = fusion::at_c<4>(v);
            calendar.tm_hour  = fusion::at_c<3>(v);
            calendar.tm_mday  = fusion::at_c<2>(v);        // day of month, 1-31
            calendar.tm_mon   = fusion::at_c<1>(v);        // already 0-based
            calendar.tm_year  = fusion::at_c<0>(v) - 1900; // years since 1900
            calendar.tm_isdst = 0;                         // same DST flag as before

            timestamp.date = std::mktime(&calendar);
            // Six-digit field scaled to a fraction of a second (e.g. 012345 -> 0.012345).
            timestamp.ms = fusion::at_c<6>(v)/1000000.0;
        }
    };

} } }

namespace qi = boost::spirit::qi;

namespace QiParsers {

    struct Months : qi::symbols<char, int> {
        Months() { this->add
                ("Jan", 0)
                ("Feb", 1)
                ("Mar", 2)
                ("Apr", 3)
                ("May", 4)
                ("Jun", 5)
                ("Jul", 6)
                ("Aug", 7)
                ("Sep", 8)
                ("Oct", 9)
                ("Nov", 10)
                ("Dec", 11);
        }
    } static const mmm_;

    static const qi::uint_parser<int, 10, 4, 4> yyyy_;
    static const qi::uint_parser<int, 10, 2, 2> dd_, hh_, mm_, ss_;
    static const qi::uint_parser<int, 10, 6, 6> fff_;

    // Grammar for a single record that keeps the date as raw text (Record1).
    template <typename It>
    struct Parser1 : qi::grammar<It, structs::Record1()>
    {
        Parser1() : Parser1::base_type(start) {
            using namespace qi;

            // '[' <everything up to ']'> ']' " - " <double> " s" " => String: "
            // <run of graphical characters> end-of-line.
            start = '[' >> raw[*~char_(']')] >> ']'
                >> " - " >> double_ >> " s"
                >> " => String: "  >> raw[+graph]
                >> eol;
        }

    private:
        qi::rule<It, structs::Record1()> start;
    };

    // Grammar for a single record whose date is parsed field by field into
    // integers and stored in the record's Timestamp member (Record2).
    template <typename It>
    struct Parser2 : qi::grammar<It, structs::Record2()>
    {
        Parser2() : Parser2::base_type(start) {
            using namespace qi;

            // "[yyyy-Mon-dd hh:mm:ss.ffffff]" -> seven ints (year, month, day,
            // hour, minute, second, fraction digits).
            date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
                >> ' ' >> hh_   >> ':' >> mm_  >> ':' >> ss_ >> '.' >> fff_ >> ']';

            // Full record: date, " - <double> s", " => String: <graphical text>", end-of-line.
            start = 
                date //'[' >> raw[*~char_(']')] >> ']'
                >> " - " >> double_ >> " s"
                >> " => String: "  >> raw[+graph]
                >> eol;
        }

    private:
        qi::rule<It, structs::Record2()> start;
        qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
    };

    // Whole-input grammar producing Records1: lines that match the record layout
    // are appended, any other line is consumed and dropped.
    template <typename It>
    struct Parser3 : qi::grammar<It, structs::Records1()>
    {
        Parser3() : Parser3::base_type(start) {
            using namespace qi;
            using boost::phoenix::push_back;

            // Same record layout as Parser1, without the trailing end-of-line.
            line = '[' >> raw[*~char_(']')] >> ']'
                >> " - " >> double_ >> " s"
                >> " => String: "  >> raw[+graph];

            // Matches any run (possibly empty) of characters other than CR/LF.
            ignore = *~char_("\r\n");

            // Each eol-separated segment is either a record (pushed onto the
            // result) or skipped via `ignore`.
            start = (line[push_back(_val, _1)] | ignore) % eol;
        }

    private:
        qi::rule<It> ignore;
        qi::rule<It, structs::Record1()> line;
        qi::rule<It, structs::Records1()> start;
    };

    // Whole-input grammar producing Records2: lines that match the record layout
    // are appended (with the date converted to a Timestamp), any other line is
    // consumed and dropped.
    template <typename It>
    struct Parser4 : qi::grammar<It, structs::Records2()>
    {
        Parser4() : Parser4::base_type(start) {
            using namespace qi;
            using boost::phoenix::push_back;

            // "[yyyy-Mon-dd hh:mm:ss.ffffff]" -> seven ints (year, month, day,
            // hour, minute, second, fraction digits).
            date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
                >> ' ' >> hh_   >> ':' >> mm_  >> ':' >> ss_ >> '.' >> fff_ >> ']';

            // Same record layout as Parser2, without the trailing end-of-line.
            line = date
                >> " - " >> double_ >> " s"
                >> " => String: "  >> raw[+graph];

            // Matches any run (possibly empty) of characters other than CR/LF.
            ignore = *~char_("\r\n");

            // Each eol-separated segment is either a record (pushed onto the
            // result) or skipped via `ignore`.
            start = (line[push_back(_val, _1)] | ignore) % eol;
        }

    private:
        qi::rule<It> ignore;
        qi::rule<It, structs::Record2()> line;
        qi::rule<It, structs::Records2()> start;
        qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
    };
}

template <typename Parser> static const Parser s_instance {};

template<template <typename> class Parser, typename Container, typename It>
Container parse_seek(It b, It e, const std::string& message)
{
    Container records;

    auto const t0 = boost::chrono::high_resolution_clock::now();
    parse(b, e, *boost::spirit::repository::qi::seek[s_instance<Parser<It> >], records);
    auto const t1 = boost::chrono::high_resolution_clock::now();

    auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
    std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";

    return records;
}

template<template <typename> class Parser, typename Container, typename It>
Container parse_ignoring(It b, It e, const std::string& message)
{
    Container records;

    auto const t0 = boost::chrono::high_resolution_clock::now();
    parse(b, e, s_instance<Parser<It> >, records);
    auto const t1 = boost::chrono::high_resolution_clock::now();

    auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
    std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";

    return records;
}

// A line that matches the record layout ...
static const std::string input1 = "[2018-Mar-01 00:01:02.012345] - 1.000 s => String: Valid_string\n";
// ... and one that does not (no "String: " payload), used as filler.
static const std::string input2 = "[2018-Mar-02 00:01:02.012345] - 2.000 s => I dont care\n";

// Builds the benchmark text: 10 groups, each consisting of one matching line
// followed by 1000 non-matching lines.
std::string prepare_input() {
    constexpr int kGroups = 10;
    constexpr int kFillerPerGroup = 1000;

    std::string text;
    // Reserve the exact final size up front so appending never reallocates.
    text.reserve(kGroups * (input1.size() + kFillerPerGroup * input2.size()));

    for (int group = 0; group < kGroups; ++group) {
        text.append(input1);
        for (int filler = 0; filler < kFillerPerGroup; ++filler) {
            text.append(input2);
        }
    }

    return text;
}

int main() {
    auto const input = prepare_input();

    auto f = input.data(), l = f + input.length();

    for (auto& r: parse_seek<QiParsers::Parser1, structs::Records1>(f, l, "std::string + seek")) {
        std::cout << r.date << "\n";
        break;
    }
    for (auto& r: parse_seek<QiParsers::Parser2, structs::Records2>(f, l, "stream + seek")) {
        auto tm = *std::localtime(&r.date.date);
        std::cout << std::put_time(&tm, "%Y-%b-%d %H:%M:%S") << " " << r.date.ms << "\n";
        break;
    }
    for (auto& r: parse_ignoring<QiParsers::Parser3, structs::Records1>(f, l, "std::string + ignoring")) {
        std::cout << r.date << "\n";
        break;
    }
    for (auto& r: parse_ignoring<QiParsers::Parser4, structs::Records2>(f, l, "stream + ignoring")) {
        auto tm = *std::localtime(&r.date.date);
        std::cout << std::put_time(&tm, "%Y-%b-%d %H:%M:%S") << " " << r.date.ms << "\n";
        break;
    }
}

打印

Elapsed time: 14 ms (std::string + seek)
2018-Mar-01 00:01:02.012345
Elapsed time: 42 ms (stream + seek)
2018-Mar-01 00:01:02 0.012345
Elapsed time: 2 ms (std::string + ignoring)
2018-Mar-01 00:01:02.012345
Elapsed time: 31 ms (stream + ignoring)
2018-Mar-01 00:01:02 0.012345

结论

解析和mktime的开销很大(占下面性能分析结果的10%)。除非您愿意选择退出boost::posix_time::from_time_string,否则您不会比std::time_t做得更好。

此方法的一个值得注意的优势是:如果某一行被忽略,就不会执行对mktime的调用。效果很明显:

  • Parser1:21.12%
  • Parser2:47.60%
  • Parser3:8.91%
  • Parser4:20.57%

忽略式解析器现在确实比基于字符串的非忽略式解析器更快。

概要分析图:

¹从另一个答案中获取代码,因此很容易比较基准测试结果