Using Unicode predefined character classes with Boost Spirit

Date: 2015-01-01 16:46:30

Tags: boost unicode boost-spirit lexer

I am trying to use the Unicode letter character class, i.e. \p{L}, with Boost Spirit, but so far I have had no luck. Below is an example in which I try to use \p{L} in the NON_ASIAN_LETTER pattern. When I swap in the commented-out [A-Za-z0-9] definition just above it, everything works, but that is not the intended behaviour, because I need to match any letter in Unicode. My use case is UTF-8 only. At the end of the day, what I am trying to do here is subtract a specific Unicode range from the set of all Unicode letters while using the boost-spirit lexer.

P.S. My example is of course simplistic and may not make much sense as a real use case, but I hope you get the idea.
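For reference, the semantics I am after: Boost.Regex, when built against ICU, accepts the same property class, so the following minimal standalone sketch pins down what I mean by "any letter in Unicode" (it assumes a Boost.Regex build with ICU support and UTF-8 narrow literals; it is not part of my lexer code):

#include <boost/regex/icu.hpp> // requires Boost.Regex built with ICU
#include <iostream>

int main() {
    // \p{L} matches any code point whose Unicode general category is Letter
    boost::u32regex letters = boost::make_u32regex("\\p{L}+");
    std::cout << boost::u32regex_match("abc\u0414\u3041", letters) << "\n"; // 1: Latin, Cyrillic, Hiragana
    std::cout << boost::u32regex_match("abc123", letters) << "\n";          // 0: digits are not letters
    return 0;
}

The lexer example itself: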

#include <boost/config/warning_disable.hpp>

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp> // for boost::phoenix::function and ref
#include <boost/fusion/include/std_pair.hpp>

#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>

using namespace boost;
using namespace boost::spirit;
using namespace std;
using namespace std::chrono;

std::vector<pair<string, string> > getTokenMacros() {

    std::vector<pair<string, string> > tokenDefinitionsVector;

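    // Hiragana code point ranges; with a UTF-8 execution character set the
    // compiler encodes each \uXXXX escape in these narrow literals as a
    // multi-byte UTF-8 sequence (see the byte dump after the listing)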
    tokenDefinitionsVector.emplace_back("JAPANESE_HIRAGANA", "[\u3041-\u3096]");
    tokenDefinitionsVector.emplace_back("JAPANESE_HIRAGANA1",
            "[\u3099-\u309E]");

    tokenDefinitionsVector.emplace_back("ASIAN_NWS", "{JAPANESE_HIRAGANA}|"
            "{JAPANESE_HIRAGANA1}");

    tokenDefinitionsVector.emplace_back("ASIAN_NWS_WORD", "{ASIAN_NWS}*");

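    // This ASCII-only definition works when swapped in for the one below it: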
    //tokenDefinitionsVector.emplace_back("NON_ASIAN_LETTER", "[A-Za-z0-9]");
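    // What I actually want: any Unicode letter (\p{L}) minus the ASIAN_NWS
    // ranges, via character-class subtraction -- this is the line that fails: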
    tokenDefinitionsVector.emplace_back("NON_ASIAN_LETTER", "[\\p{L}-[{ASIAN_NWS}]]");

    tokenDefinitionsVector.emplace_back("WORD", "{NON_ASIAN_LETTER}+");
    tokenDefinitionsVector.emplace_back("ANY", ".");

    return tokenDefinitionsVector;
}

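// Adapted from the Spirit.Lex word-count example: a polymorphic functor that
// returns the distance between two iterators (i.e. the length of a token).
// The Phoenix wrapper below (distance_fctor) would make it callable from a
// semantic action through the lex::_start/_end placeholders, but it is never
// attached to any token in this example.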
struct distance_func {
    template<typename Iterator1, typename Iterator2>
    struct result: boost::iterator_difference<Iterator1> {
    };

    template<typename Iterator1, typename Iterator2>
    typename result<Iterator1, Iterator2>::type operator()(Iterator1& begin,
            Iterator2& end) const {
        return distance(begin, end);
    }
};

boost::phoenix::function<distance_func> const distance_fctor = distance_func();

template<typename Lexer>
struct word_count_tokens: lex::lexer<Lexer> {
    word_count_tokens() :
            asianNwsWord("{ASIAN_NWS_WORD}", lex::min_token_id + 110),
            word("{WORD}", lex::min_token_id + 170),
            any("{ANY}", lex::min_token_id + 3000) {
        using lex::_start;
        using lex::_end;
        using boost::phoenix::ref;

        std::vector<pair<string, string> > tokenMacros(getTokenMacros());
        for (const auto& tokenMacro : tokenMacros) {
            this->self.add_pattern(tokenMacro.first, tokenMacro.second);
        }

        this->self = asianNwsWord | word | any;
    }

    lex::token_def<> asianNwsWord, word, any;

};

int main(int argc, char* argv[]) {

    typedef lex::lexertl::token<string::iterator> token_type;
    typedef lex::lexertl::actor_lexer<token_type> lexer_type;

    word_count_tokens<lexer_type> word_count_lexer;

    // read the file into memory (note: sampleFile is opened but never used;
    // the hard-coded sample string below is lexed instead)
    ifstream sampleFile("/home/dan/Documents/wikiSample.txt");

    string str = "abc efg ぁあ";
    string::iterator first = str.begin();
    string::iterator last = str.end();

    lexer_type::iterator_type iter = word_count_lexer.begin(first, last);
    lexer_type::iterator_type end = word_count_lexer.end();

    typedef boost::iterator_range<string::iterator> iterator_range;
    vector<iterator_range> parsed_tokens;

    while (iter != end && token_is_valid(*iter)) {
        cout << (iter->id() - lex::min_token_id) << " " << iter->value()
                << endl;
        const iterator_range range = get<iterator_range>(iter->value());
        parsed_tokens.push_back(range);
        ++iter;
    }

    if (iter != end) {
        string rest(first, last);
        cout << endl << "!!!!!!!!!" << endl << "Lexical analysis failed\n"
                << "stopped at: \"" << rest << "\"" << endl;
        cout << "#" << (int) rest.at(0) << "#" << endl;
    }

    return 0;
}
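One more note on the UTF-8-only constraint: as far as I can tell (an assumption on my part, not verified against the lexertl sources), the \uXXXX escapes in my pattern strings reach the lexer as UTF-8 byte sequences rather than single code points, because the compiler encodes them into the narrow string literals. A quick standalone check of what those literals actually contain:

#include <cstdio>

int main() {
    const char* s = "\u3041"; // HIRAGANA LETTER SMALL A, as used in JAPANESE_HIRAGANA
    // dump the raw bytes of the literal; with a UTF-8 execution character set
    // this prints "E3 81 81", the three-byte UTF-8 encoding of U+3041
    for (const unsigned char* p = (const unsigned char*) s; *p; ++p)
        std::printf("%02X ", *p);
    std::printf("\n");
    return 0;
}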

0 Answers