尝试使用BISON,Flex和LLVM解析字符串

时间:2017-01-24 09:33:20

标签: c++ llvm bison lex

我正在玩Flex,BISON和LLVM,创建我自己的编程语言,以了解编译器如何更好地工作。我已经有一个基本的解析器工作,并且在这一系列博客文章http://gnuu.org/2009/09/18/writing-your-own-toy-compiler/

中有一个语法

但是,我无法弄清楚如何扩展它以包含字符串,例如string myVar = "testing 123"

这是我的代币清单......

[ \t\n]                         ;
"extern"                return TOKEN(TEXTERN);
"return"                        return TOKEN(TRETURN);
[a-zA-Z_][a-zA-Z0-9_]*  SAVE_TOKEN; return TIDENTIFIER;
[0-9]+\.[0-9]*              SAVE_TOKEN; return TDOUBLE;
[0-9]+                          SAVE_TOKEN; return TINTEGER;
\"[^\n"]+\"             SAVE_TOKEN; return TSTRING;

"="                               return TOKEN(TEQUAL);
"=="                            return TOKEN(TCEQ);
"!="                            return TOKEN(TCNE);
"<"                             return TOKEN(TCLT);
"<="                            return TOKEN(TCLE);
">"                             return TOKEN(TCGT);
">="                              return TOKEN(TCGE);

"("                             return TOKEN(TLPAREN);
")"                             return TOKEN(TRPAREN);
"{"                                 return TOKEN(TLBRACE);
"}"                             return TOKEN(TRBRACE);
"=>"                    return TOKEN(TCLO);
"co"                    return TOKEN(TCO);
"const"                 return TOKEN(TCONST);
"let"                   return TOKEN(TLET);

"."                                 return TOKEN(TDOT);
","                             return TOKEN(TCOMMA);

"+"                             return TOKEN(TPLUS);
"-"                             return TOKEN(TMINUS);
"*"                             return TOKEN(TMUL);
"/"                             return TOKEN(TDIV);

.                       printf("Unknown token!\n"); yyterminate();

这是我的解析器......

%{
    #include "node.h"
        #include <cstdio>
        #include <cstdlib>
    NBlock *programBlock; /* the top level root node of our final AST */

    extern int yylex();
    void yyerror(const char *s) { std::printf("Error: %s\n", s);std::exit(1); }
%}

/* Represents the many different ways we can access our data */
%union {
    Node *node;
    NBlock *block;
    NExpression *expr;
    NStatement *stmt;
    NIdentifier *ident;
    NVariableDeclaration *var_decl;
    std::vector<NVariableDeclaration*> *varvec;
    std::vector<NExpression*> *exprvec;
    std::string *string;
    int token;
}

/* Define our terminal symbols (tokens). This should
   match our tokens.l lex file. We also define the node type
   they represent.
 */
%token <string> TIDENTIFIER TINTEGER TDOUBLE TSTRING
%token <token> TCEQ TCNE TCLT TCLE TCGT TCGE TEQUAL
%token <token> TLPAREN TRPAREN TLBRACE TRBRACE TCOMMA TDOT TCLO TCO TCONST TLET
%token <token> TPLUS TMINUS TMUL TDIV
%token <token> TRETURN TEXTERN

/* Define the type of node our nonterminal symbols represent.
   The types refer to the %union declaration above. Ex: when
   we call an ident (defined by union type ident) we are really
   calling an (NIdentifier*). It makes the compiler happy.
 */
%type <ident> ident
%type <expr> numeric expr string
%type <varvec> func_decl_args
%type <exprvec> call_args
%type <block> program stmts block
%type <stmt> stmt var_decl func_decl extern_decl const_func_decl let_func_decl
%type <token> comparison

/* Operator precedence for mathematical operators */
%left TPLUS TMINUS
%left TMUL TDIV

%start program

%%

program : stmts { programBlock = $1; }
        ;

stmts : stmt { $$ = new NBlock(); $$->statements.push_back($<stmt>1); }
      | stmts stmt { $1->statements.push_back($<stmt>2); }
      ;

stmt : var_decl | func_decl | extern_decl | const_func_decl | let_func_decl
     | expr { $$ = new NExpressionStatement(*$1); }
     | TRETURN expr { $$ = new NReturnStatement(*$2); }
     ;

block : TLBRACE stmts TRBRACE { $$ = $2; }
      | TLBRACE TRBRACE { $$ = new NBlock(); }
      ;

var_decl : ident ident { $$ = new NVariableDeclaration(*$1, *$2); }
         | ident ident TEQUAL expr { $$ = new NVariableDeclaration(*$1, *$2, $4); }
         ;

extern_decl : TEXTERN ident ident TLPAREN func_decl_args TRPAREN
                { $$ = new NExternDeclaration(*$2, *$3, *$5); delete $5; }
            ;

const_func_decl : TCONST ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
                { $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
                ;

let_func_decl : TLET ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
              { $$ = new NFunctionDeclaration(*$2, *$3, *$6, *$9); delete $6; }
              ;

func_decl : ident ident TEQUAL TLPAREN func_decl_args TRPAREN TCLO block
            { $$ = new NFunctionDeclaration(*$1, *$2, *$5, *$8); delete $5; }
          ;

func_decl_args : /*blank*/  { $$ = new VariableList(); }
          | var_decl { $$ = new VariableList(); $$->push_back($<var_decl>1); }
          | func_decl_args TCOMMA var_decl { $1->push_back($<var_decl>3); }
          ;

ident : TIDENTIFIER { $$ = new NIdentifier(*$1); delete $1; }
      ;

string : TSTRING { $$ = new NString($1->c_str()); delete $1; }
      ;


numeric : TINTEGER { $$ = new NInteger(atol($1->c_str())); delete $1; }
        | TDOUBLE { $$ = new NDouble(atof($1->c_str())); delete $1; }
        ;

expr : ident TEQUAL expr { $$ = new NAssignment(*$<ident>1, *$3); }
     | ident TLPAREN call_args TRPAREN { $$ = new NMethodCall(*$1, *$3); delete $3; }
     | ident { $<ident>$ = $1; }
     | numeric
         | expr TMUL expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
         | expr TDIV expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
         | expr TPLUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
         | expr TMINUS expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
     | expr comparison expr { $$ = new NBinaryOperator(*$1, $2, *$3); }
     | TLPAREN expr TRPAREN { $$ = $2; }
     ;

call_args : /*blank*/  { $$ = new ExpressionList(); }
          | expr { $$ = new ExpressionList(); $$->push_back($1); }
          | call_args TCOMMA expr  { $1->push_back($3); }
          ;

comparison : TCEQ | TCNE | TCLT | TCLE | TCGT | TCGE;

%%

最后,这是我的代码生成字符串的C ++代码......

Value* NString::codeGen(CodeGenContext& context)
{
    // Generate the type for the global var
    ArrayType* ArrayTy_0 = ArrayType::get(IntegerType::get(getGlobalContext(), 8), value.size() +1 );

    // create global var which holds the constant string.
    GlobalVariable* gvar_array__str = new GlobalVariable(*context.module,
                                                         /*Type=*/ArrayTy_0,
                                                         /*isConstant=*/true,
                                                         GlobalValue::PrivateLinkage,
                                                         /*Initializer=*/0, // has initializer, specified below
                                                         ".str");
    gvar_array__str->setAlignment(1);
    // create the contents for the string global.
    Constant* const_array_str =  ConstantDataArray::getString(getGlobalContext(), value);
    // Initialize the global with the string
    gvar_array__str->setInitializer(const_array_str);

    // generate access pointer to the string
    std::vector<Constant*> const_ptr_8_indices;
    ConstantInt* const_int = ConstantInt::get(getGlobalContext(), APInt(64, StringRef("0"), 10));
    const_ptr_8_indices.push_back(const_int);
    const_ptr_8_indices.push_back(const_int);
    Constant* const_ptr_8 = ConstantExpr::getGetElementPtr(ArrayTy_0, gvar_array__str, const_ptr_8_indices);
    return const_ptr_8;
}

在我自己的语法中,当我运行... int myInt = 123工作正常时,string myString = "123"会触发语法错误。

0 个答案:

没有答案