yacc解析器产生意外的字符。太奇怪了

时间:2014-01-05 15:47:24

标签: compilation yacc lex

首先,我把语法文件(grammar.y)和词法文件(scanner.l)附在这里以供参考:

grammar.y

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern int yylineno;
int yylex ();
int yyerror ();
//extern char* yytext;

%}

/* Semantic value of a token: one member per kind of value-carrying token. */
%union{
    int integer;
    float flt;
    char *str;
}

/* Tokens that carry a value; the <tag> names the %union member used for $n. */
%token <str> IDENTIFIER 
%token <flt> CONSTANTF
%token <integer> CONSTANTI
/* Value-less tokens: parentheses, assignment, ++/--, and comparison operators. */
%token LEFT_BRACKET RIGHT_BRACKET
%token EQ INC_OP DEC_OP LE_OP GE_OP EQ_OP NE_OP

/* Compound assignment operators. */
%token SUB_ASSIGN MUL_ASSIGN ADD_ASSIGN
/* TYPE_NAME is declared here but no rule in this grammar refers to it. */
%token TYPE_NAME
/* Keywords. */
%token INT FLOAT VOID
%token IF ELSE WHILE RETURN FOR
/* Parsing begins at the `program` nonterminal. */
%start program
%%

/*
 * Atoms of an expression: a variable, an integer or float constant, a
 * parenthesised expression, a call with or without arguments, or a
 * postfix increment/decrement of a variable.
 * The actions only trace what was recognised; $1 is the token's semantic
 * value (a char * for IDENTIFIER, see %union).
 */
primary_expression
: IDENTIFIER    {printf("use identifier: %s, length is %zu\n", $1, strlen($1));}   /* strlen() yields size_t, hence %zu */
| CONSTANTI     {printf("use constant Int: %d\n", $1);}
| CONSTANTF     {printf("use constant Float: %f\n", $1);}
| LEFT_BRACKET expression RIGHT_BRACKET
| IDENTIFIER LEFT_BRACKET RIGHT_BRACKET {printf("non-param methodCall identifier is: %s\n", $1);}
| IDENTIFIER LEFT_BRACKET argument_expression_list RIGHT_BRACKET    {printf("param methodCall identifier is: %s\n", $1);}
| IDENTIFIER INC_OP
| IDENTIFIER DEC_OP
;

/* A primary expression followed by any number of [index] subscripts. */
postfix_expression
: primary_expression
| postfix_expression '[' expression ']'
;

/* One or more comma-separated call arguments (left-recursive list). */
argument_expression_list
: expression
| argument_expression_list ',' expression
;

/* Prefix ++, prefix -- and unary operators; each may be applied repeatedly. */
unary_expression
: postfix_expression
| INC_OP unary_expression
| DEC_OP unary_expression
| unary_operator unary_expression
;

/* Arithmetic negation is the only unary operator this grammar accepts. */
unary_operator
: '-'
;

/* '*' and '/' bind tighter than '+' and '-'; left recursion makes them left-associative. */
multiplicative_expression
: unary_expression
| multiplicative_expression '*' unary_expression
| multiplicative_expression '/' unary_expression
;

/* '+' and '-' over multiplicative operands, left-associative. */
additive_expression
: multiplicative_expression
| additive_expression '+' multiplicative_expression
| additive_expression '-' multiplicative_expression
;

/*
 * At most one comparison operator per expression: both operands are
 * additive_expression, so a chain such as `a < b < c` is not accepted.
 */
comparison_expression
: additive_expression
| additive_expression '<' additive_expression
| additive_expression '>' additive_expression
| additive_expression LE_OP additive_expression
| additive_expression GE_OP additive_expression
| additive_expression EQ_OP additive_expression
| additive_expression NE_OP additive_expression
;

/*
 * Either a single assignment or a plain comparison expression.
 * The right-hand side of an assignment is a comparison_expression, so a
 * chained assignment such as `a = b = c` is not accepted.
 */
expression
: unary_expression assignment_operator comparison_expression
| comparison_expression
;

/* Plain assignment and the three compound assignments the scanner produces. */
assignment_operator
: EQ
| MUL_ASSIGN
| ADD_ASSIGN
| SUB_ASSIGN
;

/* A type followed by one or more declarators, e.g. `int a, b[3];`. */
declaration
: type_name declarator_list ';'
;

/* Comma-separated declarators (left-recursive list). */
declarator_list
: declarator
| declarator_list ',' declarator
;

/* The three built-in types; there are no user-defined type names. */
type_name
: VOID  
| INT   
| FLOAT
;

/*
 * A declared name, optionally parenthesised and followed by array
 * suffixes ([N] or []) and/or a function parameter list.
 * The IDENTIFIER action traces the declared name ($1 is a char *).
 */
declarator
: IDENTIFIER    {printf("declare an identifer: %s\n",$1);}
| LEFT_BRACKET declarator RIGHT_BRACKET
| declarator '[' CONSTANTI ']'
| declarator '[' ']'
| declarator LEFT_BRACKET parameter_list RIGHT_BRACKET
| declarator LEFT_BRACKET RIGHT_BRACKET
;

/* One or more comma-separated parameters. */
parameter_list
: parameter_declaration
| parameter_list ',' parameter_declaration
;

/* Each parameter is a type followed by exactly one declarator. */
parameter_declaration
: type_name declarator
;

/* Every statement form the language supports. */
statement
: compound_statement
| expression_statement 
| selection_statement
| iteration_statement
| jump_statement
;

/*
 * A brace-delimited block. Declarations, when present, must all come
 * before the first statement and must be followed by at least one
 * statement (there is no `'{' declaration_list '}'` alternative).
 */
compound_statement
: '{' '}'
| '{' statement_list '}'
| '{' declaration_list statement_list '}'
;

/* One or more declarations. */
declaration_list
: declaration
| declaration_list declaration
;

/* One or more statements. */
statement_list
: statement
| statement_list statement
;

/* An optional expression terminated by ';' (a lone ';' is the empty statement). */
expression_statement
: ';'
| expression ';'
;

/*
 * if / if-else / for statements.
 * Parentheses use the LEFT_BRACKET / RIGHT_BRACKET tokens because that is
 * what the scanner returns for "(" and ")"; the character literals '(' and
 * ')' are never produced by the scanner, so rules written with them could
 * never match.
 * The two IF alternatives give the usual dangling-else shift/reduce
 * conflict. In the FOR form the first two clauses are expression
 * statements (each ends in ';') and the third expression is mandatory.
 */
selection_statement
: IF LEFT_BRACKET expression RIGHT_BRACKET statement
| IF LEFT_BRACKET expression RIGHT_BRACKET statement ELSE statement
| FOR LEFT_BRACKET expression_statement expression_statement expression RIGHT_BRACKET statement
;

/*
 * while loop.
 * Parentheses use the LEFT_BRACKET / RIGHT_BRACKET tokens because that is
 * what the scanner returns for "(" and ")"; the character literals '(' and
 * ')' are never produced by the scanner.
 */
iteration_statement
: WHILE LEFT_BRACKET expression RIGHT_BRACKET statement
;

/* `return;` or `return expr;` — the only jump statement. */
jump_statement
: RETURN ';'
| RETURN expression ';'
;

/* Start symbol: one or more top-level declarations or function definitions. */
program
: external_declaration
| program external_declaration
;

/* A top-level item is either a function definition or a declaration. */
external_declaration
: function_definition
| declaration
;

/* A function is a return type, a declarator and a braced body. */
function_definition
: type_name declarator compound_statement
;

%%
#include <stdio.h>
#include <string.h>

extern char yytext[];
extern int column;
extern int yylineno;
extern FILE *yyin;

char *file_name = NULL;

/*
 * Error callback for the parser: prints "file:line:column: message" on
 * stderr.  Always returns 0.
 */
int yyerror (char *s) {
    const int line = yylineno;
    const int col = column;

    /* Flush ordinary output first so the diagnostic is emitted after it. */
    fflush (stdout);
    fprintf (stderr, "%s:%d:%d: %s\n", file_name, line, col, s);
    return 0;
}


/*
 * Entry point: parses the single source file named on the command line.
 *
 * argv[1] is opened for reading and installed as the scanner input
 * (yyin); a private copy of the name is kept in file_name for use in
 * diagnostics.
 *
 * Returns 0 when yyparse() reports success, and 1 on a usage error, when
 * the file cannot be opened, when the file name cannot be copied, or when
 * yyparse() reports failure.
 */
int main (int argc, char *argv[]) {
    FILE *input = NULL;
    int status;

    if (argc != 2) {
        fprintf (stderr, "%s: error: no input file\n", *argv);
        return 1;
    }

    input = fopen (argv[1], "r");
    if (!input) {
        fprintf (stderr, "%s: Could not open %s\n", *argv, argv[1]);
        return 1;
    }

    /* Copy the name only once the file is known to be usable, so nothing
       is leaked on the failure path above. */
    file_name = strdup (argv[1]);
    if (!file_name) {
        fprintf (stderr, "%s: error: out of memory\n", *argv);
        fclose (input);
        return 1;
    }

    yyin = input;
    status = yyparse ();

    /* Release everything acquired above before reporting the result. */
    fclose (input);
    free (file_name);
    file_name = NULL;

    return status == 0 ? 0 : 1;
}

scanner.l

%{
#include <stdio.h>
#include <stdlib.h>   /* atoi, atof: used by the constant rules */
#include <string.h>
#include "grammar.tab.h"
/* Helpers defined in the user-code section at the end of this file. */
void count();
int comment ();
int check_type ();
%}
D   [0-9]
L   [a-zA-Z_]
%option yylineno
%%
"/*"        { /* skip the body of a block comment */ comment(); }
"//".*          { /* line comment: consumed, only the column is advanced */ count(); }
"float"     { /* keyword rules come before the identifier rule so they win equal-length matches */ count(); return(FLOAT); }
"if"        { count(); return(IF); }
"else"      { count(); return(ELSE); }
"int"       { count(); return(INT); }
"return"    { count(); return(RETURN); }
"void"      { count(); return(VOID); }
"while"     { count(); return(WHILE); }
"for"       { count(); return(FOR); }
[$]?{L}({L}|{D})*   { /* NOTE(review): stores the scanner's own yytext pointer, not a copy of the text; the printf after return is unreachable */ count(); yylval.str=yytext; return(IDENTIFIER); printf("recognize identifier");}
{D}+        { count(); yylval.integer=atoi(yytext); return(CONSTANTI); }
{D}+"."{D}*     { /* at least one digit is required before the dot */ count(); yylval.flt=atof(yytext); return(CONSTANTF); }
"("         { /* parentheses are returned as named tokens, not as '(' and ')' */ count();return(LEFT_BRACKET);}
")"         {count();return(RIGHT_BRACKET);}
"="         {count();return(EQ);}
"+="        { count(); return(ADD_ASSIGN); }
"-="        { count(); return(SUB_ASSIGN); }
"*="        { count(); return(MUL_ASSIGN); }
"++"        { count(); return(INC_OP); }
"--"        { count(); return(DEC_OP); }
"<="        { count(); return(LE_OP); }
">="        { count(); return(GE_OP); }
"=="        { count(); return(EQ_OP); }
"!="        { count(); return(NE_OP); }
";"     { count(); return(';'); }
"{"         { count(); return('{'); }
"}"         { count(); return('}'); }
","     { count(); return(','); }
"/"     { count(); return('/'); }
"["         { count(); return('['); }
"]"         { count(); return(']'); }
"."     { /* returned to the parser, but no grammar rule uses '.' */ count(); return('.'); }
"!"     { /* returned to the parser, but no grammar rule uses '!' */ count(); return('!'); }
"-"     { count(); return('-'); }
"+"     { count(); return('+'); }
"*"     { count(); return('*'); }
"<"     { count(); return('<'); }
">"     { count(); return('>'); }
[ \t\v\n\f] { /* whitespace: tracked for the column counter, no token returned */ count(); }
.       { /* ignore bad characters */ }

%%
/*
 * End-of-input hook called by the scanner.  Always returns 1, which by
 * lex convention means there is no further input to switch to.
 */
int yywrap() {
    enum { NO_MORE_INPUT = 1 };

    return NO_MORE_INPUT;
}

/*
 * Skips the body of a block comment.  Called after the opening delimiter
 * has been matched; consumes characters up to and including the closing
 * star-slash pair, or up to the end of the input if the comment is never
 * closed.  Always returns 0.
 *
 * The character is kept in an int and both 0 and EOF are treated as end
 * of input, because lex/flex versions differ in which of the two input()
 * returns there; testing only for 0 can loop forever on an unterminated
 * comment.
 */
int comment() {
    int c;
    int prev = 0;   /* previously consumed character; 0 before the first one */

    while ((c = input()) != EOF && c != 0) {
        /* The comment ends at the first '/' that directly follows a '*'. */
        if (prev == '*' && c == '/')
            break;
        prev = c;
    }
    return 0;
}

/* Current output column of the scanner; count() sets it to 0 at each newline. */
int column = 0;

/*
 * Advances `column` over the text of the token just matched (yytext):
 * a newline resets it to 0, a tab moves it to the next multiple of 8,
 * and any other character advances it by one.
 */
void count() {
    const char *p;

    for (p = yytext; *p != '\0'; ++p) {
        switch (*p) {
        case '\n':
            column = 0;
            break;
        case '\t':
            column += 8 - (column % 8);
            break;
        default:
            column++;
            break;
        }
    }
}

Makefile:

# Tools. YACC must accept the --defines option used below.
LEX=lex
YACC=yacc
CFLAGS=-Wall
CC=gcc

# These targets do not name files.
.PHONY: all clean

all:parse

# Both generated sources are compiled and linked in a single step.
# Generating grammar.c also produces grammar.tab.h, which scanner.c includes.
parse:grammar.c scanner.c
	$(CC) $(CFLAGS) -o $@ $^

# Generate the parser and the token header.
grammar.c:grammar.y
	$(YACC) -d -o $@ --defines=grammar.tab.h $<

# Generate a scanner from any .l file.
%.c:%.l
	$(LEX) -o $@ $<

# Remove everything this Makefile generates, including the header and the binary.
clean:
	rm -f parse grammar.c grammar.tab.h scanner.c

=============================================== ===============================

问题在于: 当我解析输入文件,如:

int a;
int fhu;
float fs;

int drive(float te){
    int b;
    b=1;
    fhu = drive(fs);
    fs = 0.4;
    return 0;
}

我得到了以下输出:

declare an identifer: a
declare an identifer: fhu
declare an identifer: fs
declare an identifer: drive
declare an identifer: te
declare an identifer: b
use identifier: b=, length is 2
use constant Int: 1
use identifier: fhu =, length is 5
use identifier: fs), length is 3
param methodCall identifier is: drive(fs)
use identifier: fs =, length is 4
use constant Float: 0.400000
use constant Int: 0

我很困惑:为什么识别出来的标识符是“b=”、“fhu =”和“fs)”,后面多出了意外的字符“=”、“=”和“)”?可以看到,在声明语句中标识符被正确识别了,但在函数体的语句中却没有。

为什么他们使用相同的词汇规则但产生不同的结果?

有没有人可以帮我解决这个问题?我是yacc的新手,任何建议都可能有所帮助和欢迎!拜托,谢谢!

1 个答案:

答案 0 :(得分:2)

正如我在第一条评论中提到的,问题出在没有保存词法扫描器返回的字符串的副本。我能够按描述重现这个问题(我用的是 Mac OS X 10.9.1 Mavericks 而不是 Ubuntu,所以这个问题与平台无关)。

'琐碎'修复是:

[$]?{L}({L}|{D})*   { count(); yylval.str=strdup(yytext); printf("recognize identifier (%s)\n", yylval.str); return(IDENTIFIER);}

那里有三处变化:

  1. 在返回前移动printf(),以便执行。
  2. printf()打印标识符字符串。
  3. 关键的一个:使用strdup()复制字符串!
“琐碎”这个词加了引号(两次),因为一旦分配了内存,马上就要面对“它在哪里被释放”这个问题,而目前的答案是“没有任何地方释放——直到程序退出”,这不太可能是合适的长期解决方案。因此,你需要检查 IDENTIFIER 记号返回的标识符之后是如何被使用的,以确保这些内存最终得到释放。不过,这个修复可以让你回到正轨。

    在示例文件上运行parse的输出:

    recognize identifier (a)
    declare an identifer: a
    recognize identifier (fhu)
    declare an identifer: fhu
    recognize identifier (fs)
    declare an identifer: fs
    recognize identifier (drive)
    declare an identifer: drive
    recognize identifier (te)
    declare an identifer: te
    recognize identifier (b)
    declare an identifer: b
    recognize identifier (b)
    use identifier: b, length is 1
    use constant Int: 1
    recognize identifier (fhu)
    use identifier: fhu, length is 3
    recognize identifier (drive)
    recognize identifier (fs)
    use identifier: fs, length is 2
    param methodCall identifier is: drive
    recognize identifier (fs)
    use identifier: fs, length is 2
    use constant Float: 0.400000
    use constant Int: 0
    

      

    我想知道的最后一件事是:当代码直接把 yytext 赋给 yylval.str 时,为什么语法分析器得到的是“b=”而不是“b”?在进行语法分析的过程中,yytext 是如何改变的?

    尝试把下面的内容加入你的代码——特别是 grammar.y 中的 push_identifier() 和 dump_identifiers() 函数——并分别配合“使用 strdup()”和“不使用 strdup()”两个版本的 scanner.l 运行。

    %{
    #include <stdio.h>
    #include <string.h>
    extern int yylineno;
    int yylex(void);
    int yyerror(char *str);
    static void push_identifier(char *str);
    //extern char* yytext;

    %}
    %expect 1

    %union{
        int integer;
        float flt;
        char *str;
    }

    %token <str> IDENTIFIER
    %token <flt> CONSTANTF
    %token <integer> CONSTANTI
    %token LEFT_BRACKET RIGHT_BRACKET
    %token EQ INC_OP DEC_OP LE_OP GE_OP EQ_OP NE_OP

    %token SUB_ASSIGN MUL_ASSIGN ADD_ASSIGN
    %token TYPE_NAME
    %token INT FLOAT VOID
    %token IF ELSE WHILE RETURN FOR
    %start program
    %%

    primary_expression
    : IDENTIFIER    {printf("use identifier: %s, length is %zu\n", $1, strlen($1)); push_identifier($1);}
    | CONSTANTI     {printf("use constant Int: %d\n", $1);}
    | CONSTANTF     {printf("use constant Float: %f\n", $1);}
    | LEFT_BRACKET expression RIGHT_BRACKET
    | IDENTIFIER LEFT_BRACKET RIGHT_BRACKET {printf("non-param methodCall identifier is: %s\n", $1); push_identifier($1);}
    | IDENTIFIER LEFT_BRACKET argument_expression_list RIGHT_BRACKET    {printf("param methodCall identifier is: %s\n", $1); push_identifier($1);}
    | IDENTIFIER INC_OP { push_identifier($1); }
    | IDENTIFIER DEC_OP { push_identifier($1); }
    ;

    postfix_expression
    : primary_expression
    | postfix_expression '[' expression ']'
    ;

    argument_expression_list
    : expression
    | argument_expression_list ',' expression
    ;

    unary_expression
    : postfix_expression
    | INC_OP unary_expression
    | DEC_OP unary_expression
    | unary_operator unary_expression
    ;

    unary_operator
    : '-'
    ;

    multiplicative_expression
    : unary_expression
    | multiplicative_expression '*' unary_expression
    | multiplicative_expression '/' unary_expression
    ;

    additive_expression
    : multiplicative_expression
    | additive_expression '+' multiplicative_expression
    | additive_expression '-' multiplicative_expression
    ;

    comparison_expression
    : additive_expression
    | additive_expression '<' additive_expression
    | additive_expression '>' additive_expression
    | additive_expression LE_OP additive_expression
    | additive_expression GE_OP additive_expression
    | additive_expression EQ_OP additive_expression
    | additive_expression NE_OP additive_expression
    ;

    expression
    : unary_expression assignment_operator comparison_expression
    | comparison_expression
    ;

    assignment_operator
    : EQ
    | MUL_ASSIGN
    | ADD_ASSIGN
    | SUB_ASSIGN
    ;

    declaration
    : type_name declarator_list ';'
    ;

    declarator_list
    : declarator
    | declarator_list ',' declarator
    ;

    type_name
    : VOID
    | INT
    | FLOAT
    ;

    declarator
    : IDENTIFIER    {printf("declare an identifer: %s\n",$1); push_identifier($1); }
    | LEFT_BRACKET declarator RIGHT_BRACKET
    | declarator '[' CONSTANTI ']'
    | declarator '[' ']'
    | declarator LEFT_BRACKET parameter_list RIGHT_BRACKET
    | declarator LEFT_BRACKET RIGHT_BRACKET
    ;

    parameter_list
    : parameter_declaration
    | parameter_list ',' parameter_declaration
    ;

    parameter_declaration
    : type_name declarator
    ;

    statement
    : compound_statement
    | expression_statement
    | selection_statement
    | iteration_statement
    | jump_statement
    ;

    compound_statement
    : '{' '}'
    | '{' statement_list '}'
    | '{' declaration_list statement_list '}'
    ;

    declaration_list
    : declaration
    | declaration_list declaration
    ;

    statement_list
    : statement
    | statement_list statement
    ;

    expression_statement
    : ';'
    | expression ';'
    ;

    selection_statement
    : IF '(' expression ')' statement
    | IF '(' expression ')' statement ELSE statement
    | FOR '(' expression_statement expression_statement expression ')' statement
    ;

    iteration_statement
    : WHILE '(' expression ')' statement
    ;

    jump_statement
    : RETURN ';'
    | RETURN expression ';'
    ;

    program
    : external_declaration
    | program external_declaration
    ;

    external_declaration
    : function_definition
    | declaration
    ;

    function_definition
    : type_name declarator compound_statement
    ;

    %%
    #include <stdio.h>
    #include <string.h>

    extern char yytext[];
    extern int column;
    extern int yylineno;
    extern FILE *yyin;

    char *file_name = NULL;

    int yyerror(char *s) {
        fflush(stdout);
        fprintf(stderr, "%s:%d:%d: %s\n", file_name, yylineno, column, s);
        return 0;
    }

    static char *list[20];
    static int sp = 0;

    static void push_identifier(char *str)
    {
        list[sp++] = str;   // Appalling lack of error checking - not fit for production
    }

    static void dump_identifiers(void)
    {
        printf("Identifiers (%d):\n", sp);
        for (int i = 0; i < sp; i++)
            printf("[%2d] = <<%s>>\n", i, list[i]);
    }

    int main(int argc, char *argv[]) {
        FILE *input = NULL;
        if (argc == 2) {
            input = fopen(argv[1], "r");
            file_name = strdup(argv[1]);
            if (input) {
                yyin = input;
            }
            else {
                fprintf(stderr, "%s: Could not open %s\n", *argv, argv[1]);
                return 1;
            }
        }
        else {
            fprintf(stderr, "%s: error: no input file\n", *argv);
            return 1;
        }
        yyparse();
        dump_identifiers();
        free(file_name);
        return 0;
    }

    使用“使用 strdup()”的版本时,输出的结尾是:

    use constant Int: 0
    Identifiers (11):
    [ 0] = <<a>>
    [ 1] = <<fhu>>
    [ 2] = <<fs>>
    [ 3] = <<drive>>
    [ 4] = <<te>>
    [ 5] = <<b>>
    [ 6] = <<b>>
    [ 7] = <<fhu>>
    [ 8] = <<fs>>
    [ 9] = <<drive>>
    [10] = <<fs>>

    使用“不使用 strdup()”的版本时,输出的结尾是:

    use constant Int: 0
    Identifiers (11):
    [ 0] = <<a; int fhu; float fs; int drive(float te){ int b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 1] = <<fhu; float fs; int drive(float te){ int b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 2] = <<fs; int drive(float te){ int b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 3] = <<drive(float te){ int b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 4] = <<te){ int b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 5] = <<b; b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 6] = <<b=1; fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 7] = <<fhu = drive(fs); fs = 0.4; return 0; } >>
    [ 8] = <<fs); fs = 0.4; return 0; } >>
    [ 9] = <<drive(fs); fs = 0.4; return 0; } >>
    [10] = <<fs = 0.4; return 0; } >>

    换句话说,扫描器中的代码出于自身需要,在重复使用 yytext 所指向的缓冲区。以上是解析结束时的情况;至于解析过程中具体发生了什么,我不能确定——需要在每次调用 push_identifier() 时都调用一次 dump_identifiers() 才能看清。在 push_identifier() 中打印地址可能也很有启发性。