我正在尝试解析一种语言,在这种语言中,一元减号与二元减号是根据减号两侧是否存在空格来区分的。下面是一些伪规则,定义了这种语言如何解释减号:
-x // unary
x - y // binary
x-y // binary
x -y // unary
x- y // binary
(- y ... // unary
注意:最后一条规则中的左括号(open paren)可以替换为语言中除 'identifier'、'number' 和 'close_paren' 之外的任何标记。
注意:在第 4 种情况(x -y)下,x 是一个标识符。标识符本身可以构成一条语句,而 -y 是另一条单独的语句。
由于减号的类型取决于空格,我想让词法分析器返回两种不同的标记:一种用于一元减号,一种用于二元减号。我该怎么做?有什么建议吗?
代码:下面这段代码对我来说可以工作,但我不太确定它是否足够健壮。为了简化,我删除了所有不相关的词法分析器规则:
#ifndef LEXER_H
#define LEXER_H
#include <iostream>
#include <algorithm>
#include <string>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_function.hpp>
#include <boost/spirit/include/phoenix_algorithm.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#define BOOST_SPIRIT_LEXERTL_DEBUG 1
using std::string;
using std::cerr;
namespace skill {
namespace lex = boost::spirit::lex;
namespace phoenix = boost::phoenix;
// Iterator type used to walk the input text.
typedef std::string::iterator BaseIteratorT;

// Token type: built over BaseIteratorT, with int and std::string listed as
// the attribute types a token value may hold.
typedef lex::lexertl::token<
    BaseIteratorT,
    boost::mpl::vector<int, std::string>
> TokenT;

// Lexer type: the lexertl lexer variant instantiated with TokenT
// (rules in this file attach semantic actions to it).
typedef lex::lexertl::actor_lexer<TokenT> LexerT;
/**
 * Token definitions for the lexer.
 *
 * Registers the regular-expression macros, the whitespace rule, the two
 * rewriting rules that normalise the text around a '-' sign, the unary
 * minus rule and the literal rules. While scanning, it also keeps a running
 * line count that can be read back through lineNo().
 *
 * NOTE(review): `unput` is used in several semantic actions below but is
 * not declared anywhere in this block; presumably it is a Phoenix function
 * defined elsewhere that replaces the matched text [_start, _end) with the
 * given string so it gets scanned again -- confirm against its definition.
 */
template <typename LexerT>
struct Tokens: public lex::lexer<LexerT>
{
    /**
     * Builds every token definition and attaches it to the lexer.
     *
     * @param input  Not referenced by any of the rules shown here.
     */
    Tokens(const string& input):
        lineNo_(1)
    {
        using lex::_start;
        using lex::_end;
        using lex::_pass;
        using lex::_state;
        using lex::_tokenid;
        using lex::_val;
        using lex::omit;
        using lex::pass_flags;
        using lex::token_def;
        using phoenix::ref;
        using phoenix::count;
        using phoenix::construct;

        // Named sub-patterns, referenced as {NAME} in the definitions below.
        this->self.add_pattern
            ("EXP", "(e|E)(\\+|-)?\\d+")
            ("SUFFIX", "[yzafpnumkKMGTPEZY]")
            ("INTEGER", "-?\\d+")
            ("FLOAT", "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
            ("SYMBOL", "[a-zA-Z_?@](\\w|\\?|@)*")
            ("STRING", "\\\"([^\\\"]|\\\\\\\")*\\\"");

        // Whitespace and comments (comments_ is defined here but is not
        // attached to the lexer in this block).
        whitespaces_ = "\\s+";
        comments_ = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";

        // Literals
        float_ = "{FLOAT}";
        integer_ = "{INTEGER}";
        string_ = "{STRING}";
        symbol_ = "{SYMBOL}";

        // Operators (plus_ and difference_ are defined here but are not
        // attached to the lexer in this block).
        plus_ = '+';
        difference_ = '-';
        minus_ = "-({SYMBOL}|\\()";
        // ... more operators

        // Whitespace: add the number of newlines in the match to the line
        // counter, then drop the match without emitting a token.
        this->self += whitespaces_
            [
                ref(lineNo_) += count(construct<string>(_start, _end), '\n'),
                _pass = pass_flags::pass_ignore
            ];

        // A '-' squeezed between two identifier/number characters (or
        // between ')' and '(') is a binary minus: re-insert the three matched
        // characters with a space on each side of the '-'.
        // NOTE(review): the pattern only sees ONE character on each side of
        // the '-'. If the left operand has several characters (e.g. "foo-bar")
        // a longer literal match may consume it first, so this rule would
        // not fire -- verify with multi-character operands.
        this->self += token_def<omit>("[)a-zA-Z?_0-9]-[(a-zA-Z?_0-9]")
            [
                unput(_start, _end, *_start + construct<string>(" ") + *(_start + 1) + " " + *(_start + 2)),
                _pass = pass_flags::pass_ignore
            ];

        // Operators (except for close-brackets) cannot be followed by a
        // binary minus: when such an operator is followed by a '-' that has
        // whitespace on at least one side, re-insert the operator
        // immediately followed by '-', with the whitespace removed.
        this->self += token_def<omit>("['`.+*<>/!~&|({\\[=,:@](\\s+-\\s*|\\s*-\\s+)")
            [
                unput(_start, _end, *_start + construct<string>("-")),
                _pass = pass_flags::pass_ignore
            ];

        // A '-' directly preceding a symbol or an open paren is a unary
        // minus: push everything after the '-' back and give the token the
        // value "-".
        this->self += minus_
            [
                unput(_start, _end, construct<string>(_start + 1, _end)),
                _val = construct<string>("-")
            ];

        // Literal rules
        this->self += float_ | integer_ | string_ | symbol_;
        // ... other rules
    }

    /** @return the current value of the line counter (starts at 1). */
    size_t lineNo() const { return lineNo_; }

    // The using-declarations inside the constructor are local to it, so the
    // member types below are spelled with explicit lex:: qualification.

    // ignored tokens
    lex::token_def<lex::omit> whitespaces_, comments_;
    // literal tokens
    lex::token_def<int> integer_;
    lex::token_def<string> float_, symbol_, string_;
    // operator tokens
    lex::token_def<> plus_, difference_, minus_; // minus_ is a unary minus
    // ... other tokens

    // current line number
    size_t lineNo_;
};
}
#endif // LEXER_H
基本上,我将二元减号(在代码中称为 difference)定义为任何两边都有空格的减号,并使用 unput 来保证这条规则成立。我还将一元减号定义为直接位于符号或左括号之前的减号,并同样使用 unput 来保证这条规则成立(对于数字,减号是标记本身的一部分)。