我一直在解析包含日期的日志文件并将它们存储为字符串,但由于字符串分配,这会耗费大量内存并且成本很高。
有人建议我用 Timestamp 存储日期,并用 Boost Spirit 的 stream 解析器来解析它,所以我尝试了“boost::posix_time::ptime”和传统的“std::time_t + std::get_time”,但它们都严重拖累了性能。
我想尝试一种新方法:把日期的各个字段解析为普通的 int,然后通过 traits 把它们转换为 std::time_t。我尽量参照了这个 Spirit 示例 https://www.boost.org/doc/libs/1_60_0/libs/spirit/example/qi/parse_date.cpp,但我连测试都没能自己写出来。Boost Spirit 晦涩的 transform_attribute 语法也让问题更难解决。
有人可以帮我解决这个问题吗?
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <ctime>
// Iterator type used by the non-template entry points of this listing.
using It = std::string::const_iterator;
namespace structs {
    // Raw integer fields produced by the date rule, in grammar order:
    // year, month, day, hour, minute, second, six-digit fraction.
    using date_parts = boost::fusion::vector<int, int, int, int, int, int, int>;

    // A parsed point in time: calendar time plus the fractional field.
    struct Timestamp
    {
        std::time_t date;  // result of std::mktime on the parsed fields
        int ms;            // value of the fractional field, stored as parsed
    };

    // One log line: timestamp, the duration printed before " s", and the payload.
    struct Record
    {
        Timestamp date;
        double time;
        std::string str;
    };

    // All records parsed from one input.
    using Records = std::vector<Record>;
}
// Expose structs::Record as a Fusion sequence so Qi can fill its members in
// declaration order. The macro is expanded at global scope, so every member
// type has to be fully qualified (a bare `Timestamp` does not resolve here).
BOOST_FUSION_ADAPT_STRUCT(structs::Record,
    (structs::Timestamp, date)
    (double, time)
    (std::string, str))
namespace boost { namespace spirit { namespace traits
{
    // Qi customization point: lets a rule that synthesizes structs::date_parts
    // be used where a structs::Timestamp attribute is expected.
    template<>
    struct transform_attribute<structs::Timestamp, structs::date_parts, qi::domain>
    {
        typedef structs::date_parts type;

        // Hands the parser an empty field tuple to fill in.
        static type pre(structs::Timestamp) { return type(); }

        // Nothing to undo when the parse fails.
        static void fail(structs::Timestamp&) { }

        // Converts the parsed fields into the Timestamp after a successful match.
        static void post(structs::Timestamp& timestamp, type const& v)
        {
            // Fields are assigned by name: the order of std::tm members is
            // implementation-defined, so positional initialization is fragile.
            std::tm time = std::tm();
            time.tm_year  = fusion::at_c<0>(v) - 1900;  // tm_year counts from 1900
            time.tm_mon   = fusion::at_c<1>(v);         // already zero-based (Jan == 0)
            time.tm_mday  = fusion::at_c<2>(v);
            time.tm_hour  = fusion::at_c<3>(v);
            time.tm_min   = fusion::at_c<4>(v);
            time.tm_sec   = fusion::at_c<5>(v);
            time.tm_isdst = 0;                          // same DST flag as before
            timestamp.date = std::mktime(&time);
            timestamp.ms = fusion::at_c<6>(v);
        }
    };
} } }
namespace qi = boost::spirit::qi;
namespace QiParsers {
    // Grammar for a whole log: one record per line, lines separated by eol.
    // Each line looks like
    //   [2018-Mar-13 13:13:59.580482] - 0.200 s => String: Test_1
    template <typename It>
    struct Parser : qi::grammar<It, structs::Records()> {
        Parser() : Parser::base_type(start) {
            using namespace qi;

            // Month abbreviations map to zero-based month numbers (std::tm style).
            month.add
                ("Jan", 0)
                ("Feb", 1)
                ("Mar", 2)
                ("Apr", 3)
                ("May", 4)
                ("Jun", 5)
                ("Jul", 6)
                ("Aug", 7)
                ("Sep", 8)
                ("Oct", 9)
                ("Nov", 10)
                ("Dec", 11);

            // Fixed-width integer parsers are used instead of repeat(n)[digit]:
            // the latter synthesizes characters, not an int, so date_parts would
            // receive character codes rather than numeric values.
            date = year_ >> '-' >> month >> '-' >> two_digits_ >> ' ' >>
                   two_digits_ >> ':' >> two_digits_ >> ':' >> two_digits_ >> '.' >> fraction_;

            line = '[' >> date >> ']'
                >> " - " >> double_ >> " s"
                >> " => String: " >> raw[+graph];

            start = line % eol;
        }

    private:
        qi::uint_parser<int, 10, 4, 4> year_;        // exactly four decimal digits
        qi::uint_parser<int, 10, 2, 2> two_digits_;  // day, hour, minute, second
        qi::uint_parser<int, 10, 6, 6> fraction_;    // exactly six decimal digits
        qi::symbols<char, int> month;
        qi::rule<It, structs::date_parts()> date;
        qi::rule<It, structs::Record()> line;
        qi::rule<It, structs::Records()> start;
    };
}
// Parses the range [b, e) with QiParsers::Parser and returns whatever records
// were collected. The success flag returned by the parse call is not inspected.
structs::Records parse_string(It b, It e)
{
    // The grammar object is a function-local static shared by all calls.
    static const QiParsers::Parser<It> grammar;

    structs::Records parsed;
    qi::parse(b, e, grammar, parsed);
    return parsed;
}
// Sample log: three newline-terminated records, written as adjacent string
// literals that the compiler concatenates into a single string.
static const std::string input =
    "[2018-Mar-13 13:13:59.580482] - 0.200 s => String: Test_1\n"
    "[2018-Mar-14 13:13:59.580482] - 0.400 s => String: Test_2\n"
    "[2018-Mar-15 13:13:59.580482] - 0.600 s => String: Test_3\n";
int main() {
const auto records = parse_string(input.begin(), input.end());
return 0;
}
答案 0 :(得分:1)
发布Why does using a stream in boost spirit penalize performance so much?后,我重新阅读了您的帖子并在此处添加了该方法。
关于特征和解析器规则的声明方式存在相当多的问题。
值得注意的是,repeat(2)[digit] 不会转换为整数属性。我怀疑你可能已经得到了很多 49、50 之类的值('1'、'2' 等字符的 ASCII 码),也许还有一些不确定的值。
您从月份值中扣除了1900
简化为:
namespace QiParsers {
struct Months : qi::symbols<char, int> {
Months() { this->add
("Jan", 0)
("Feb", 1)
("Mar", 2)
("Apr", 3)
("May", 4)
("Jun", 5)
("Jul", 6)
("Aug", 7)
("Sep", 8)
("Oct", 9)
("Nov", 10)
("Dec", 11);
}
} static const mmm_;
static const qi::uint_parser<int, 10, 4, 4> yyyy_;
static const qi::uint_parser<int, 10, 2, 2> dd_, hh_, mm_, ss_;
static const qi::uint_parser<int, 10, 6, 6> fff_;
}
现在解析器可以清晰地写成¹:
// Grammar for a single newline-terminated record whose bracketed date is parsed
// into integer fields (and converted to structs::Timestamp on assignment).
template <typename It>
struct Parser2 : qi::grammar<It, structs::Record2()>
{
    Parser2() : Parser2::base_type(start) {
        // "[yyyy-Mon-dd hh:mm:ss.ffffff]"
        date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
            >> ' ' >> hh_ >> ':' >> mm_ >> ':' >> ss_
            >> '.' >> fff_ >> ']';

        // `date` takes the place of the string-capturing
        // '[' >> raw[*~char_(']')] >> ']' used by the string-based variant.
        start = date
            >> " - " >> qi::double_ >> " s"
            >> " => String: " >> qi::raw[+qi::graph]
            >> qi::eol;
    }

  private:
    qi::rule<It, structs::Record2()> start;
    qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
};
基本上你拥有的东西,但要解决一些怪癖:
// Qi customization point: lets any rule attribute `Attr` (here the 7-int tuple
// produced by the date rule) be converted into a structs::Timestamp.
template <typename Attr>
struct transform_attribute<structs::Timestamp, Attr, qi::domain> {
    using type = Attr;

    // Hands the parser a default-constructed attribute to fill in.
    static type pre(structs::Timestamp) { return type(); }

    // Nothing to undo when the parse fails.
    static void fail(structs::Timestamp&) { }

    // Converts the parsed fields into the Timestamp after a successful match.
    static void post(structs::Timestamp& timestamp, type const& v) {
        // std::tm is filled by member name. Its member order is
        // implementation-defined and tm_gmtoff/tm_zone are extensions, so an
        // 11-value positional initializer only works with one particular libc.
        // Value-initialization zeroes everything not set below.
        std::tm time = std::tm();
        time.tm_sec   = fusion::at_c<5>(v);          // seconds
        time.tm_min   = fusion::at_c<4>(v);          // minutes
        time.tm_hour  = fusion::at_c<3>(v);          // hours
        time.tm_mday  = fusion::at_c<2>(v);          // day (1-31)
        time.tm_mon   = fusion::at_c<1>(v);          // month (0-11)
        time.tm_year  = fusion::at_c<0>(v) - 1900;   // years since 1900
        time.tm_isdst = 0;                           // same DST flag as before

        timestamp.date = std::mktime(&time);
        // The six-digit fractional field is scaled by 1e-6.
        timestamp.ms = fusion::at_c<6>(v)/1000000.0;
    }
};
基准测试运行并正确解析:
**Live On Coliru**
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_seek.hpp>
#include <boost/chrono/chrono.hpp>
#include <iomanip>
#include <ctime>
namespace structs {
    // Calendar time plus the fractional part that std::time_t cannot hold.
    struct Timestamp {
        std::time_t date;
        double ms;
    };

    // Log record that keeps the bracketed date as raw text.
    struct Record1 {
        std::string date;
        double time;
        std::string str;
    };

    // Log record that keeps the date as a converted Timestamp.
    struct Record2 {
        Timestamp date;
        double time;
        std::string str;
    };

    using Records1 = std::vector<Record1>;
    using Records2 = std::vector<Record2>;
}
// Adapt Record1 as a Fusion sequence (date text, duration, payload) so Qi can
// fill its members in declaration order.
BOOST_FUSION_ADAPT_STRUCT(structs::Record1,
(std::string, date)
(double, time)
(std::string, str))
// Same for Record2, whose first member is a Timestamp instead of raw text.
BOOST_FUSION_ADAPT_STRUCT(structs::Record2,
(structs::Timestamp, date)
(double, time)
(std::string, str))
namespace boost { namespace spirit { namespace traits {
    // Customization point used when a matched iterator range (e.g. from raw[])
    // is stored into a std::string attribute.
    template <typename It>
    struct assign_to_attribute_from_iterators<std::string, It, void> {
        static inline void call(It f, It l, std::string& attr) {
            // assign(f, l) copies the range in one go and never dereferences
            // `f`, so an empty match (f == l) is handled safely.
            attr.assign(f, l);
        }
    };

    // Customization point: lets any rule attribute `Attr` (here the 7-int tuple
    // produced by the date rule) be converted into a structs::Timestamp.
    template <typename Attr>
    struct transform_attribute<structs::Timestamp, Attr, qi::domain> {
        using type = Attr;

        // Hands the parser a default-constructed attribute to fill in.
        static type pre(structs::Timestamp) { return type(); }

        // Nothing to undo when the parse fails.
        static void fail(structs::Timestamp&) { }

        // Converts the parsed fields into the Timestamp after a successful match.
        static void post(structs::Timestamp& timestamp, type const& v) {
            // std::tm is filled by member name. Its member order is
            // implementation-defined and tm_gmtoff/tm_zone are extensions, so
            // an 11-value positional initializer only works with one particular
            // libc. Value-initialization zeroes everything not set below.
            std::tm time = std::tm();
            time.tm_sec   = fusion::at_c<5>(v);          // seconds
            time.tm_min   = fusion::at_c<4>(v);          // minutes
            time.tm_hour  = fusion::at_c<3>(v);          // hours
            time.tm_mday  = fusion::at_c<2>(v);          // day (1-31)
            time.tm_mon   = fusion::at_c<1>(v);          // month (0-11)
            time.tm_year  = fusion::at_c<0>(v) - 1900;   // years since 1900
            time.tm_isdst = 0;                           // same DST flag as before

            timestamp.date = std::mktime(&time);
            // The six-digit fractional field is scaled by 1e-6.
            timestamp.ms = fusion::at_c<6>(v)/1000000.0;
        }
    };
} } }
namespace qi = boost::spirit::qi;
namespace QiParsers {
struct Months : qi::symbols<char, int> {
Months() { this->add
("Jan", 0)
("Feb", 1)
("Mar", 2)
("Apr", 3)
("May", 4)
("Jun", 5)
("Jul", 6)
("Aug", 7)
("Sep", 8)
("Oct", 9)
("Nov", 10)
("Dec", 11);
}
} static const mmm_;
static const qi::uint_parser<int, 10, 4, 4> yyyy_;
static const qi::uint_parser<int, 10, 2, 2> dd_, hh_, mm_, ss_;
static const qi::uint_parser<int, 10, 6, 6> fff_;
// Grammar for a single newline-terminated record; the bracketed date is kept
// as raw text (structs::Record1).
template <typename It>
struct Parser1 : qi::grammar<It, structs::Record1()>
{
    Parser1() : Parser1::base_type(start) {
        start = '[' >> qi::raw[*~qi::char_(']')] >> ']'   // everything up to ']'
            >> " - " >> qi::double_ >> " s"
            >> " => String: " >> qi::raw[+qi::graph]
            >> qi::eol;
    }

  private:
    qi::rule<It, structs::Record1()> start;
};
// Grammar for a single newline-terminated record whose bracketed date is parsed
// into integer fields (and converted to structs::Timestamp on assignment).
template <typename It>
struct Parser2 : qi::grammar<It, structs::Record2()>
{
    Parser2() : Parser2::base_type(start) {
        // "[yyyy-Mon-dd hh:mm:ss.ffffff]"
        date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
            >> ' ' >> hh_ >> ':' >> mm_ >> ':' >> ss_
            >> '.' >> fff_ >> ']';

        // `date` takes the place of the string-capturing
        // '[' >> raw[*~char_(']')] >> ']' used by Parser1.
        start = date
            >> " - " >> qi::double_ >> " s"
            >> " => String: " >> qi::raw[+qi::graph]
            >> qi::eol;
    }

  private:
    qi::rule<It, structs::Record2()> start;
    qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
};
// Whole-input grammar (string dates): every eol-separated line is either a
// valid record, which is appended to the result, or is skipped via `ignore`.
template <typename It>
struct Parser3 : qi::grammar<It, structs::Records1()>
{
    Parser3() : Parser3::base_type(start) {
        line = '[' >> qi::raw[*~qi::char_(']')] >> ']'
            >> " - " >> qi::double_ >> " s"
            >> " => String: " >> qi::raw[+qi::graph];

        // Consumes the rest of a line that is not a record.
        ignore = *~qi::char_("\r\n");

        // Only successfully parsed lines reach the semantic action.
        start = (line[boost::phoenix::push_back(qi::_val, qi::_1)] | ignore) % qi::eol;
    }

  private:
    qi::rule<It> ignore;
    qi::rule<It, structs::Record1()> line;
    qi::rule<It, structs::Records1()> start;
};
// Whole-input grammar (Timestamp dates): every eol-separated line is either a
// valid record, which is appended to the result, or is skipped via `ignore`.
template <typename It>
struct Parser4 : qi::grammar<It, structs::Records2()>
{
    Parser4() : Parser4::base_type(start) {
        // "[yyyy-Mon-dd hh:mm:ss.ffffff]"
        date = '[' >> yyyy_ >> '-' >> mmm_ >> '-' >> dd_
            >> ' ' >> hh_ >> ':' >> mm_ >> ':' >> ss_
            >> '.' >> fff_ >> ']';

        line = date
            >> " - " >> qi::double_ >> " s"
            >> " => String: " >> qi::raw[+qi::graph];

        // Consumes the rest of a line that is not a record.
        ignore = *~qi::char_("\r\n");

        // Only successfully parsed lines reach the semantic action.
        start = (line[boost::phoenix::push_back(qi::_val, qi::_1)] | ignore) % qi::eol;
    }

  private:
    qi::rule<It> ignore;
    qi::rule<It, structs::Record2()> line;
    qi::rule<It, structs::Records2()> start;
    qi::rule<It, boost::fusion::vector<int, int, int, int, int, int, int>()> date;
};
}
// One value-initialized, immutable instance per parser type (variable template),
// shared by the benchmark helpers below so a grammar is not rebuilt per call.
template <typename Parser> static const Parser s_instance {};
template<template <typename> class Parser, typename Container, typename It>
Container parse_seek(It b, It e, const std::string& message)
{
Container records;
auto const t0 = boost::chrono::high_resolution_clock::now();
parse(b, e, *boost::spirit::repository::qi::seek[s_instance<Parser<It> >], records);
auto const t1 = boost::chrono::high_resolution_clock::now();
auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";
return records;
}
template<template <typename> class Parser, typename Container, typename It>
Container parse_ignoring(It b, It e, const std::string& message)
{
Container records;
auto const t0 = boost::chrono::high_resolution_clock::now();
parse(b, e, s_instance<Parser<It> >, records);
auto const t1 = boost::chrono::high_resolution_clock::now();
auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";
return records;
}
// A line every record grammar accepts ...
static const std::string input1 = "[2018-Mar-01 00:01:02.012345] - 1.000 s => String: Valid_string\n";
// ... and a line that lacks the "String: " marker the grammars look for.
static const std::string input2 = "[2018-Mar-02 00:01:02.012345] - 2.000 s => I dont care\n";

// Builds the benchmark text: 10 groups, each made of one `input1` line followed
// by 1000 `input2` lines.
std::string prepare_input() {
    constexpr int kGroups = 10;
    constexpr int kFillerPerGroup = 1000;

    std::string text;
    // Reserve the exact final size up front so appending never reallocates.
    text.reserve(kGroups * (input1.size() + kFillerPerGroup * input2.size()));

    for (int group = 0; group < kGroups; ++group) {
        text.append(input1);
        for (int filler = 0; filler < kFillerPerGroup; ++filler) {
            text.append(input2);
        }
    }
    return text;
}
int main() {
auto const input = prepare_input();
auto f = input.data(), l = f + input.length();
for (auto& r: parse_seek<QiParsers::Parser1, structs::Records1>(f, l, "std::string + seek")) {
std::cout << r.date << "\n";
break;
}
for (auto& r: parse_seek<QiParsers::Parser2, structs::Records2>(f, l, "stream + seek")) {
auto tm = *std::localtime(&r.date.date);
std::cout << std::put_time(&tm, "%Y-%b-%d %H:%M:%S") << " " << r.date.ms << "\n";
break;
}
for (auto& r: parse_ignoring<QiParsers::Parser3, structs::Records1>(f, l, "std::string + ignoring")) {
std::cout << r.date << "\n";
break;
}
for (auto& r: parse_ignoring<QiParsers::Parser4, structs::Records2>(f, l, "stream + ignoring")) {
auto tm = *std::localtime(&r.date.date);
std::cout << std::put_time(&tm, "%Y-%b-%d %H:%M:%S") << " " << r.date.ms << "\n";
break;
}
}
打印
Elapsed time: 14 ms (std::string + seek)
2018-Mar-01 00:01:02.012345
Elapsed time: 42 ms (stream + seek)
2018-Mar-01 00:01:02 0.012345
Elapsed time: 2 ms (std::string + ignoring)
2018-Mar-01 00:01:02.012345
Elapsed time: 31 ms (stream + ignoring)
2018-Mar-01 00:01:02 0.012345
解析和 mktime 的开销很大(占下面性能分析结果的 10%)。除非您愿意放弃 boost::posix_time::from_time_string,否则不会比 std::time_t 做得更好。
此方法一个值得注意的优势是:如果某行被忽略,就不会调用 mktime。效果很明显:忽略式解析器现在确实比基于字符串的非忽略解析器更快。
概要分析图:
¹从另一个答案中获取代码,因此很容易比较基准测试结果