多行注释语法问题-LEX / YACC

时间:2018-10-22 03:04:25

标签: compiler-construction yacc lex context-free-grammar

基本上,我只是想在lex中忽略注释,根本不把注释传递给yacc。但由于某些原因,当输入中包含多行注释时,我的解析器会把注释内容打印出来,而此时它本不应该打印任何内容。

这是我遇到的问题: enter image description here

老实说,我不确定发生了什么。这里本不应该打印任何内容。为什么注释会被打印出来?是我的lex文件中的规则写错了吗?

这是我的lex文件:

 %{
/*
 * Token codes are not defined in this .l file; they come from the
 * yacc-generated header y.tab.h (original note: numbered from 257).
 */

#include "y.tab.h"
/* Current input line; starts at 1 and is incremented by the newline rule. */
int input_line_no = 1;
/* Buffers declared for token and line text; no rule shown here uses them. */
char the_tokens[1000];
char full_line[1000];
/*
 * Scanner mode set by the rule actions: 0 = normal, 1 = block comment,
 * 2 = string literal, 3 = character literal.
 */
int lex_state = 0;

%}

 /* Named sub-patterns referenced as {name} in the rules section. */
whitespace         [ \t]
number             [0-9]
letter             [A-Za-z]
alfanum            [A-Za-z0-9_]
intcon             {number}+
id                 {letter}{alfanum}*
anything           .

%option noyywrap
 /*
  * Start conditions used to recognise comments, literal strings and
  * literal chars: string_in and char_in are declared with %Start,
  * COMMENT is declared with %x.
  */

%Start  string_in char_in 

%x COMMENT
%%

 /*
  * Comment handling.
  * Rule 1: a line holding only a brace-delimited comment (optional tabs
  *         around it) is discarded together with its newline.
  * Rule 2: a block-comment opener at the start of a line (after optional
  *         tabs) sets lex_state to 1 and enters the COMMENT condition.
  * Rule 3: a block comment that opens and closes on one line is discarded
  *         together with its newline.
  * NOTE(review): the newlines consumed by rules 1 and 3 never reach the
  * line-counting rule, so input_line_no is not incremented for them.
  */
^[\t]*"{".*"}"[\t]*\n ;
^[\t]*"/*" {lex_state = 1; BEGIN COMMENT;}
^[\t]*"/*".*"*/"[\t]*\n ;

 /*
  * Inside COMMENT: the terminator (optionally followed by tabs and a
  * newline) resets lex_state and returns to start condition 0; a bare
  * newline is skipped.
  * NOTE(review): the last rule only matches a character that is directly
  * followed by a newline. Every other character of the comment body matches
  * no COMMENT rule, so the default rule applies to it (flex echoes unmatched
  * text to the output) - presumably why the comment text gets printed.
  * Newlines skipped here are not counted in input_line_no either.
  */
<COMMENT>"*/"[\t]*\n {lex_state=0; BEGIN 0;}
<COMMENT>"*/" {lex_state=0; BEGIN 0;}
<COMMENT>\n ;
<COMMENT>.\n ;

 /*
  * Reserved words. These rules come before the identifier rule, so a
  * reserved word wins the tie against an identifier of the same length.
  * FOR, WHILE and RETURN are declared as tokens and used by the grammar,
  * so the scanner has to produce them as well.
  */
"extern"        {return EXTERN;}
"if"            {return IF;}
"else"          {return ELSE;}
"for"           {return FOR;}
"while"         {return WHILE;}
"return"        {return RETURN;}
"void"          {return VOID;}
"char"          {return CHAR;}
"int"           {return INT;}


 /* Every newline matched by this rule advances the line counter. */
[\n]                        input_line_no++;


 /*
  * String literals: an opening double quote seen in INITIAL sets
  * lex_state to 2 and enters string_in; the closing double quote resets
  * lex_state and returns to INITIAL.
  * NOTE(review): string_in is declared with %Start, so rules without a
  * start-condition prefix presumably stay active inside a string - confirm.
  */
<INITIAL>\"             {
                lex_state = 2;
                                BEGIN(string_in);

                        }
<string_in>[^"]     {
                /* One STRINGCON is returned for each single character. */
                return(STRINGCON);
            }
<string_in>\"       {
                lex_state = 0;
                BEGIN(INITIAL);
            }
 /*
  * Character literals: an opening single quote seen in INITIAL sets
  * lex_state to 3 and enters char_in; the closing single quote resets
  * lex_state and returns to INITIAL. Each character in between, and the
  * two-character escapes for newline and NUL, is returned as CHARCON.
  * The opening brace of every action sits on the same line as its pattern:
  * flex requires the action to begin on the pattern line, otherwise the
  * rule gets an empty action and the braces are treated as stray text.
  */
<INITIAL>\' {
            lex_state = 3;
            BEGIN(char_in);
        }
<char_in>[^']   {
            return(CHARCON);
        }
<char_in>\\n    {
            return(CHARCON);
        }
<char_in>\\0    {
            return(CHARCON);
        }
<char_in>\' {
            lex_state = 0;
            BEGIN(INITIAL);
        }

{whitespace}    ;

 /*tokenization of numbers*/
{intcon}         {return(INTCON);}
{id}        {return ID;}

 /*tokenization of operations*/
"=="        {return EQUALS;}
"!="        {return NOTEQU;}
">="        {return GREEQU;}
"<="        {return LESEQU;}
">"     {return GREATE;}
"<"     {return LESSTH;}

"&&"        {return ANDCOM;}
"||"        {return ORCOMP;}
"!"             {return ABANG;}

";"     {return SEMIC;}
","     {return COMMA;}
"("     {return LPAR;}
")"     {return RPAR;}      
"["     {return LBRAC;}
"]"     {return RBRAC;}
"{"     {return LCURL;}
"}"     {return RCURL;}

"+"     {return ADD;}
"-"     {return SUB;}
"*"     {return MUL;}
"/"     {return DIV;}
"="     {return EQUAL;}

 /*For strings that can not be identified by any patterns specified previously
 *lex returns the value of the character
 */

{anything}     {return(OTHER);}

%%

这是我的yacc文件:

%{

#include <stdio.h>

/*
 * Debug printing: with YDEBUG defined, Y_DEBUG_PRINT(x) prints the string x;
 * without it the macro expands to nothing.
 */
#define YDEBUG
#ifndef YDEBUG

#define Y_DEBUG_PRINT(x)

#else

#define Y_DEBUG_PRINT(x) printf("Yout %s \n ",x)

#endif
int yydebug = 0; 

/*
 * Objects defined in the lex file. The declarations must match those
 * definitions exactly: both buffers are arrays there, and an array must not
 * be declared as a pointer in another translation unit.
 */
extern char the_tokens[];   /* token text buffer */
extern int input_line_no;   /* current line number */
extern char full_line[];    /* full line buffer */
extern int lex_state;       /* 0 normal, 1 comment, 2 string, 3 char */

/*
 * Prototypes for the scanner entry point and for the reporting helpers
 * defined after the grammar, so that the generated parser and the rule
 * actions do not depend on implicit function declarations.
 */
int yylex(void);
int yyerror(const char *s);
int warn(char *s);

%}

/* Token kinds produced by the scanner. */
%token STRINGCON CHARCON INTCON EQUALS NOTEQU GREEQU LESEQU GREATE LESSTH
%token ANDCOM ORCOMP SEMIC COMMA LPAR RPAR LBRAC RBRAC LCURL RCURL ABANG
%token EQUAL ADD SUB MUL DIV ID EXTERN FOR WHILE RETURN IF ELSE 
%token VOID CHAR INT OTHER

/*
 * Operator precedence, listed from lowest to highest. UMINUS is not a token
 * returned by the scanner; it is only named by %prec on the unary minus
 * rule of Expr.
 */
%left ORCOMP
%left ANDCOM
%left EQUALS NOTEQU
%left LESSTH GREATE LESEQU GREEQU
%left ADD SUB
%left MUL DIV
%right UMINUS
%right ABANG

/* The grammar's start symbol. */
%start prog
%%

/*
 * A program is a possibly empty sequence of declarations (each followed by
 * SEMIC) and function definitions. prog and prog2 list the same
 * alternatives.
 */
prog:
| dcl SEMIC prog2
| Function prog2 

prog2:
| dcl SEMIC prog2 
| Function  prog2 

/*
 * A declaration is either a variable list or one or more function
 * prototypes, optionally introduced by EXTERN and by a return type
 * (Type or VOID).
 */
dcl: VAR_list 
| ID LPAR Param_types RPAR dcl2 
| EXTERN ID LPAR Param_types RPAR dcl2 
| EXTERN Type ID LPAR Param_types RPAR dcl2 
| EXTERN VOID ID LPAR Param_types RPAR dcl2 
| Type ID LPAR Param_types RPAR dcl2 
| VOID ID LPAR Param_types RPAR dcl2 

/* Further comma-separated prototypes within the same declaration. */
dcl2: 
| COMMA ID LPAR Param_types RPAR dcl2 

/* A function definition: optional return type, head, and a braced body. */
Function: Functionhead LCURL Functionbody RCURL 
| VOID Functionhead LCURL Functionbody RCURL 
| Type Functionhead LCURL Functionbody RCURL 

Functionhead: ID LPAR Param_types RPAR 

/*
 * A body is either empty or one variable list directly followed by a
 * statement list.
 * NOTE(review): no SEMIC appears between VAR_list and STMT_list here -
 * confirm that this is intended.
 */
Functionbody: 
|VAR_list STMT_list 

/* Parameter list: the single word VOID or one or more typed parameters. */
Param_types: VOID 
|Param_types1 

Param_types1: Param_type1 
| Param_types1 COMMA Param_type1 

/* One parameter: a type, a name, and an optional empty bracket pair. */
Param_type1: Type ID Param_type11 

Param_type11: 
| LBRAC RBRAC 

/* Variable declarations: a type followed by comma-separated declarators. */
VAR_list: Type VAR_list2 

VAR_list2: var_decl 
| var_decl COMMA VAR_list2 

/* A declarator is a plain name or a name with an integer array size. */
var_decl: ID 
| ID LBRAC INTCON RBRAC 

Type: CHAR 
|INT

/* A statement list is one or more statements. */
STMT_list: STMT2 

STMT2: STMT 
| STMT STMT2 

/*
 * Statements. Besides the well-formed forms, several alternatives match
 * common mistakes and report them through warn().
 */
STMT : IF LPAR Expr RPAR STMT 
| IF LPAR Expr RPAR STMT ELSE STMT
 /* error form: if without the opening parenthesis */
| IF Expr RPAR STMT ELSE STMT {warn("STMT-IF: missing LPAR");}
 /* error form: if without the closing parenthesis */
| IF LPAR Expr STMT ELSE STMT {warn("STMT-IF: missing RPAR");}
 /* error form: two consecutive else keywords */
| IF LPAR Expr STMT ELSE ELSE STMT {warn(":too many elses");}
| WHILE LPAR Expr RPAR STMT
 /* for(c=0;c<1;c++) */
| FOR LPAR Assign SEMIC Expr SEMIC Assign RPAR STMT 
 /* for(;c<1;c++) */
| FOR LPAR SEMIC Expr SEMIC Assign RPAR STMT 
 /* for(;;c++) */
| FOR LPAR SEMIC SEMIC Assign RPAR STMT 
 /* for(;;) */
| FOR LPAR SEMIC SEMIC RPAR STMT 
 /* for(c=0;;) */
| FOR LPAR Assign SEMIC SEMIC RPAR STMT 
 /* for(c=0;c<1;) */
| FOR LPAR Assign SEMIC Expr SEMIC RPAR STMT 
 /* for(c=0;;c++) */
| FOR LPAR Assign SEMIC SEMIC Assign RPAR STMT 
 /* for(;c<1;) */
| FOR LPAR SEMIC Expr SEMIC RPAR STMT 
 /* error form: for() with nothing between the parentheses */
| FOR LPAR RPAR STMT {warn("STMT-FOR: empty statement");}
 /* error form: for(;;;) with three semicolons; no body follows in this rule */
| FOR LPAR SEMIC SEMIC SEMIC RPAR {warn("STMT-FOR: too many semicolons");}
 /* error form: for;;) without the opening parenthesis */
| FOR SEMIC SEMIC RPAR STMT {warn("STMT-FOR: missing LPAR");}
 /* error form: for(;; without the closing parenthesis */
| FOR LPAR SEMIC SEMIC STMT {warn("STMT-FOR: missing RPAR");}
| RETURN Expr SEMIC 
| RETURN SEMIC 
 /* error form: return without a terminating semicolon */
| RETURN {warn("STMT-Return:missing semicolon");}
| Assign SEMIC 
/* function call used as a statement */
| ID LPAR RPAR SEMIC 
| ID LPAR Expr Expr2 RPAR SEMIC 
 /* error form: function call without a terminating semicolon */
| ID LPAR Expr Expr2 RPAR {warn(":missing semicolon");}  
| LCURL STMT2 RCURL 
| LCURL RCURL 
| SEMIC

/*
 * Assignment to a variable or to an array element.
 * NOTE(review): the second alternative derives Assign from Assign alone, so
 * the rule refers only to itself - confirm that yacc accepts it as intended.
 */
Assign : ID Assign1 EQUAL Expr 
 /* error form: assignment without a semicolon */
| Assign {warn( "Assign: missing semicolon on line");}

/* Optional array subscript on the assignment target, with error forms. */
Assign1 : 
| LBRAC Expr RBRAC
| LBRAC Expr error { warn("Assign1: missing RBRAC"); }
| error Expr RBRAC { warn("Assign1: missing LBRAC"); }
| LBRAC error RBRAC { warn("Assign1: Invalid array index"); }

/*
 * Expressions.
 * NOTE(review): the binary operators are reached through the nonterminals
 * Binop, Relop and Logop, so the %left declarations may not disambiguate
 * these rules - check yacc's conflict report.
 * NOTE(review): the sequence ID LBRAC Expr RBRAC is listed here directly and
 * is also reachable through Array, i.e. in two different ways.
 */
Expr : SUB Expr %prec UMINUS
| ABANG Expr 
| Expr Binop Expr 
| Expr Relop Expr
| Expr Logop Expr 
| ID 
| ID LPAR RPAR 
| ID LPAR Expr Expr2 RPAR 
| ID LBRAC Expr RBRAC 
| LPAR Expr RPAR 
| INTCON 
| CHARCON 
| STRINGCON 
| Array 
| error {warn("Expr: invalid expression "); }

/* Further call arguments; the empty alternative means no more arguments. */
Expr2: 
| COMMA Expr 
 /* recursively matches the remaining arguments of a call (exp1,exp2,exp3,...) */
| COMMA Expr Expr2


/* Array element reference, with an error form for a bad subscript. */
Array : 
ID LBRAC Expr RBRAC 
| ID error RBRAC {warn( "Array: invalid array expression"); }

/* Arithmetic operators. */
Binop : ADD 
| SUB 
| MUL 
| DIV 

/* Logical operators. */
Logop : ANDCOM 
| ORCOMP 

/* Relational operators. */
Relop : EQUALS 

| NOTEQU 

| LESEQU 

| GREEQU 

| GREATE 

| LESSTH 


%%

/* Declared before main() so the calls below are checked against a prototype. */
int yyerror(const char *s);
int warn(char *s);

/*
 * Program entry point: runs the parser, then reports an input that ended
 * while the scanner was still inside a comment (lex_state 1) or a string
 * (lex_state 2). Returns the value produced by yyparse().
 */
int main(void)
{
    int result = yyparse();

    if (lex_state == 1) {
        yyerror("End of file within a comment");
    }
    if (lex_state == 2) {
        yyerror("End of file within a string");
    }
    return result;
}

/* Always returns 1, telling the scanner there is no further input. */
int yywrap(void)
{
    return 1;
}

/*
 * Error reporter: prints the message s and the current line number to
 * stderr. Always returns 0.
 */
int yyerror(const char *s)
{
    fprintf(stderr, "%s on line %d\n", s, input_line_no);
    return 0;
}

/*
 * Warning reporter used by the grammar's error alternatives: prints the
 * message s to stderr. Always returns 0.
 */
int warn(char *s)
{
    fprintf(stderr, "%s\n", s);
    return 0;
}

这是我要运行的测试:

/* function definitions interspersed with global declarations and
   function prototypes */

void a( void ), b(int x), c(int x, int y, int z);

int a1( void ), b1(int x), c1(int x, char y, char z, int w);
int x, y[10], z;
int x0, y0, z0[20];

void foo0( void ) {}

void foo1( int x ) {}

char u0, u1[10];
char a2( void ), b2(char x), c2(char x, char y, char z, int w);

extern int a3( void ), b3(int x), c3(int x, char y, char z, int w);

extern char a4( void ), b4(char x), c4(char x, char y, char z, int w);

void foo2( int x, int y, int z ) {}

int foo3( int x[], char y, int z[], char w[] ) {}

int x1, x2[100], x3, x4, x5[1000];
int b5(int x[]), c5(int x, char y[], char z, int w[], int u[], int v);

char b6(char x[]), c6(char x, char y[], char z[], int w);

char foo4( int x[], char y, int z[], char w[] ) {}

extern int a7( void ), b7(int x[]), c7(int x[], char y, char z[], int w[]);

extern char a8( void ), b8(char x[]), c8(char x, char y[], char z, int w[]);

我尝试过重写匹配注释的规则,但似乎没有任何进展,也不知道还需要做些什么。任何帮助将不胜感激,谢谢!

2 个答案:

答案 0 :(得分:0)

(F)lex自动添加默认的后备规则

<*>.|\n        ECHO;

在规则集的末尾,因此规则无法识别的任何字符都将打印在标准输出上。这就是您所看到的。

这种行为很少是您在解析器中想要的,并且我几乎总是以以下方式启动我的flex文件

%option nodefault

[注1]

这将取消默认的回退规则,如果某个输入将使用该规则,则会产生警告。不幸的是,警告消息不是很清楚关于哪些输入可能无法匹配的信息,但是如果您忽略警告并使用生成的扫描程序,则会在运行时对不匹配的输入产生致命错误。

在这种情况下,很明显在COMMENT起始条件下注释的内容不匹配。也许您打算对第四条规则使用.|\n?尽管那样会使第三条规则变得多余。


注意:

  1. 实际上,我通常使用:

    %option nodefault noinput nounput noyywrap 8bit yylineno
    

    noinput 和 nounput 用于抑制关于未使用函数的编译器警告(因为我通常不使用这些函数);noyywrap 免去了提供 yywrap 的需要,因此 flex 在看到输入结束时会立即发送输入结束标记;而 yylineno 让 flex 跟踪行号,这对错误消息很有用。

    只要您使用默认表设置,8bit 就是默认值;但如果您要求生成“快速”扫描器,默认值就会改变,此时若输入包含大于 127 的字符代码,将产生未定义的行为。我发现很难对快速表选项进行时序测试,因此尽管我通常不使用该选项(它不会加快多少速度,并且会使表更大),但考虑到其他人可能想要使用它,显式加上 8bit 似乎是谨慎的做法。

答案 1 :(得分:0)

由于多种原因,您的块注释模式基本上是错误的...

通常,对于块注释,词法模式是这样的:

 /* A comment opener anywhere in the input switches to COMMENT. */
"/*"  { BEGIN COMMENT; }

 /*
  * Inside COMMENT: runs of characters other than star and slash are
  * skipped; one or more stars followed by a slash end the comment; any
  * remaining single star or slash is skipped on its own.
  */
<COMMENT>[^*/]+ { /* ignore anything that is not '*' or '/' */ }
<COMMENT>("*"+)"/" { BEGIN INITIAL; }
<COMMENT>[*/] { /* a star or slash that is not part of the terminator */ }