我正在玩Flex,BISON和LLVM,创建我自己的编程语言,以了解编译器如何更好地工作。我已经有一个基本的解析器工作,并且在这一系列博客文章http://gnuu.org/2009/09/18/writing-your-own-toy-compiler/
中有一个语法但是,我无法弄清楚如何扩展它以包含字符串,例如string myVar = "testing 123"
。
这是我的代币清单......
[ \t\n] ;
"extern" return TOKEN(TEXTERN);
"return" return TOKEN(TRETURN);
[a-zA-Z_][a-zA-Z0-9_]* SAVE_TOKEN; return TIDENTIFIER;
[0-9]+\.[0-9]* SAVE_TOKEN; return TDOUBLE;
[0-9]+ SAVE_TOKEN; return TINTEGER;
\"[^\n"]+\" SAVE_TOKEN; return TSTRING;
"=" return TOKEN(TEQUAL);
"==" return TOKEN(TCEQ);
"!=" return TOKEN(TCNE);
"<" return TOKEN(TCLT);
"<=" return TOKEN(TCLE);
">" return TOKEN(TCGT);
">=" return TOKEN(TCGE);
"(" return TOKEN(TLPAREN);
")" return TOKEN(TRPAREN);
"{" return TOKEN(TLBRACE);
"}" return TOKEN(TRBRACE);
"=>" return TOKEN(TCLO);
"co" return TOKEN(TCO);
"const" return TOKEN(TCONST);
"let" return TOKEN(TLET);
"." return TOKEN(TDOT);
"," return TOKEN(TCOMMA);
"+" return TOKEN(TPLUS);
"-" return TOKEN(TMINUS);
"*" return TOKEN(TMUL);
"/" return TOKEN(TDIV);
. printf("Unknown token!\n"); yyterminate();
这是我的解析器......
%{
#include "node.h"
#include <cstdio>
#include <cstdlib>
NBlock *programBlock; /* the top level root node of our final AST */
extern int yylex();
void yyerror(const char *s) { std::printf("Error: %s\n", s);std::exit(1); }
%}
/* Represents the many different ways we can access our data */
%union {
Node *node;
NBlock *block;
NExpression *expr;
NStatement *stmt;
NIdentifier *ident;
NVariableDeclaration *var_decl;
std::vector<NVariableDeclaration*> *varvec;
std::vector<NExpression*> *exprvec;
std::string *string;
int token;
}
/* Define our terminal symbols (tokens). This should
match our tokens.l lex file. We also define the node type
they represent.
*/
%token <string> TIDENTIFIER TINTEGER TDOUBLE TSTRING
%token <token> TCEQ TCNE TCLT TCLE TCGT TCGE TEQUAL
%token <token> TLPAREN TRPAREN TLBRACE TRBRACE TCOMMA TDOT TCLO TCO TCONST TLET
%token <token> TPLUS TMINUS TMUL TDIV
%token <token> TRETURN TEXTERN
/* Define the type of node our nonterminal symbols represent.
The types refer to the %union declaration above. Ex: when
we call an ident (defined by union type ident) we are really
calling an (NIdentifier*). It makes the compiler happy.
*/
%type <ident> ident
%type <expr> numeric expr string
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl extern_decl const_func_decl let_func_decl
%type <token> comparison
/* Operator precedence for mathematical operators */
%left TPLUS TMINUS
%left TMUL TDIV
%start program
%%
program : stmts { programBlock = $1; }
;
stmts : stmt { $$ = new NBlock(); $$->statements.push_back($<stmt>1); }
| stmts stmt { $1->statements.push_back($<stmt>2); }
;
stmt : var_decl | func_decl | extern_decl | const_func_decl | let_func_decl
| expr { $$ = new NExpressionStatement(*$1); }
| TRETURN expr { $$ = new NReturnStatement(*$2); }
;
block : TLBRACE stmts TRBRACE { $$ = $2; }
| TLBRACE TRBRACE { $$ = new NBlock(); }
;
var_decl : ident ident { $$ = new NVariableDeclaration(*$1, *$2); }
| ident ident TEQUAL expr { $$ = new NVariableDeclaration(*$1, *$2, $4); }
;
extern_decl : TEXTERN ident ident TLPAREN func_decl_args TRPAREN
{ $$ = new NExternDeclaration(*$2, *$3, *$5); delete $5; }
;
const_func_decl : TCONST ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
;
let_func_decl : TLET ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
;
func_decl : ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
{ $$ = new NFunctionDeclaration(*$1, *$2, *$5, *$8); delete $5; }
;
func_decl_args : /*blank*/ { $$ = new VariableList(); }
| var_decl { $$ = new VariableList(); $$->push_back($<var_decl>1); }
| func_decl_args TCOMMA var_decl { $1->push_back($<var_decl>3); }
;
ident : TIDENTIFIER { $$ = new NIdentifier(*$1); delete $1; }
;
string : TSTRING { $$ = new NString($1->c_str()); delete $1; }
;
numeric : TINTEGER { $$ = new NInteger(atol($1->c_str())); delete $1; }
| TDOUBLE { $$ = new NDouble(atof($1->c_str())); delete $1; }
;
expr : ident TEQUAL expr { $$ = new NAssignment(*$<ident>1, *$3); }
| ident TLPAREN call_args TRPAREN { $$ = new NMethodCall(*$1, *$3); delete $3; }
| ident { $<ident>$ = $1; }
| numeric
| expr TMUL expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TDIV expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TPLUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr TMINUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| expr comparison expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
| TLPAREN expr TRPAREN { $$ = $2; }
;
call_args : /*blank*/ { $$ = new ExpressionList(); }
| expr { $$ = new ExpressionList(); $$->push_back($1); }
| call_args TCOMMA expr { $1->push_back($3); }
;
comparison : TCEQ | TCNE | TCLT | TCLE | TCGT | TCGE;
%%
最后,这是我的代码生成字符串的C ++代码......
Value* NString::codeGen(CodeGenContext& context)
{
// Generate the type for the global var
ArrayType* ArrayTy_0 = ArrayType::get(IntegerType::get(getGlobalContext(), 8), value.size() +1 );
// create global var which holds the constant string.
GlobalVariable* gvar_array__str = new GlobalVariable(*context.module,
/*Type=*/ArrayTy_0,
/*isConstant=*/true,
GlobalValue::PrivateLinkage,
/*Initializer=*/0, // has initializer, specified below
".str");
gvar_array__str->setAlignment(1);
// create the contents for the string global.
Constant* const_array_str = ConstantDataArray::getString(getGlobalContext(), value);
// Initialize the global with the string
gvar_array__str->setInitializer(const_array_str);
// generate access pointer to the string
std::vector<Constant*> const_ptr_8_indices;
ConstantInt* const_int = ConstantInt::get(getGlobalContext(), APInt(64, StringRef("0"), 10));
const_ptr_8_indices.push_back(const_int);
const_ptr_8_indices.push_back(const_int);
Constant* const_ptr_8 = ConstantExpr::getGetElementPtr(ArrayTy_0, gvar_array__str, const_ptr_8_indices);
return const_ptr_8;
}
在我自己的语法中,当我运行... int myInt = 123
工作正常时,string myString = "123"
会触发语法错误。