在 ANTLR4 中为 Python 解释器实现缩进管理

时间:2018-03-24 19:22:58

标签: java python indentation antlr4

我正在使用 ANTLR4 作为词法分析器(lexer)和语法分析器(parser)的生成器来实现一个 Python 解释器。语法基于以下链接中定义的 BNF: https://github.com/antlr/grammars-v4/blob/master/python3/Python3.g4。 但是,当我定义复合语句时,在 @lexer::members 中借助 INDENT 和 DEDENT 标记实现的缩进处理并没有按预期工作。 例如,对于下面这段代码:

x=10
while x>2 :
      print("hello")
            x=x-3

按理说,在重新给变量 x 赋值的那一行(它比上一行多缩进了一层)应该出现缩进错误,但目前并没有报错。 我是否需要修改词法分析器的代码,还是需要改别的地方? 下面是我使用的 @lexer::members 代码,以及上面链接中定义的 NEWLINE 规则的 BNF。

grammar python;

tokens { INDENT, DEDENT }

@lexer::members {

  // Queue of pending tokens. emit() appends to it and nextToken() hands out
  // its head when it is not empty, which lets a single match of the NEWLINE
  // lexer rule produce several tokens (NEWLINE, INDENT, DEDENT).
  private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();

  // Stack of indentation widths of the blocks that are currently open.
  // The NEWLINE rule treats an empty stack as indentation 0.
  private java.util.Stack<Integer> indents = new java.util.Stack<>();

  // Number of currently opened braces, brackets and parentheses. The NEWLINE
  // rule skips line breaks while this is greater than zero.
  private int opened = 0;

  // The most recently produced token on the default channel. createDedent()
  // copies its line number onto the DEDENT tokens it builds.
  private Token lastToken = null;

  // Appends t to the pending-token queue and records it as the current token
  // of the underlying lexer.
  @Override
  public void emit(Token t) {
    tokens.addLast(t);
    super.setToken(t);
  }

  // Returns the oldest queued token when the queue is not empty, otherwise
  // the token returned by the base lexer. When the end of the input is ahead
  // while blocks are still open, a closing NEWLINE, one DEDENT per open block
  // and a fresh EOF are queued first.
  @Override
  public Token nextToken() {

    boolean atEndWithOpenBlocks = _input.LA(1) == EOF && !indents.isEmpty();

    if (atEndWithOpenBlocks) {

      // Drop every EOF token already waiting in the queue; a new one is
      // appended after the DEDENT tokens below.
      java.util.Iterator<Token> pending = tokens.iterator();
      while (pending.hasNext()) {
        if (pending.next().getType() == EOF) {
          pending.remove();
        }
      }

      // An extra line break that serves as the end of the last statement.
      emit(commonToken(pythonParser.NEWLINE, "\n"));

      // Close every block that is still open (at least one, see the guard).
      do {
        emit(createDedent());
        indents.pop();
      } while (!indents.isEmpty());

      // Put the EOF back at the end of the token stream.
      emit(commonToken(pythonParser.EOF, "<EOF>"));
    }

    Token fetched = super.nextToken();

    // Remember the last token seen on the default channel.
    if (fetched.getChannel() == Token.DEFAULT_CHANNEL) {
      this.lastToken = fetched;
    }

    if (tokens.isEmpty()) {
      return fetched;
    }
    return tokens.poll();
  }

  // Builds a DEDENT token through commonToken() and stamps it with the line
  // number of the last default-channel token.
  private Token createDedent() {
    int line = this.lastToken.getLine();
    CommonToken token = commonToken(pythonParser.DEDENT, "");
    token.setLine(line);
    return token;
  }

  // Creates a default-channel token of the given type. The text argument is
  // used only for its length: the token covers the text.length() characters
  // that end just before the current character index. For an empty text,
  // start and stop are both that last index.
  private CommonToken commonToken(int type, String text) {
    int last = this.getCharIndex() - 1;
    int first;
    if (text.isEmpty()) {
      first = last;
    } else {
      first = last - text.length() + 1;
    }
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, first, last);
  }

  // Computes the indentation width of a run of whitespace characters.
  //
  // A tab advances the width to the next multiple of eight; every other
  // character counts as exactly one column. This follows the rule:
  //
  // "Tabs are replaced (from left to right) by one to eight spaces
  //  such that the total number of characters up to and including
  //  the replacement is a multiple of eight [...]"
  //
  //  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
  static int getIndentationCount(String spaces) {
    int width = 0;
    for (int i = 0; i < spaces.length(); i++) {
      if (spaces.charAt(i) == '\t') {
        // Jump to the next tab stop.
        width = (width / 8 + 1) * 8;
      } else {
        width++;
      }
    }
    return width;
  }

  // True only while the lexer is positioned at line 1, column 0.
  boolean atStartOfInput() {
    boolean onFirstLine = super.getLine() == 1;
    boolean atFirstColumn = super.getCharPositionInLine() == 0;
    return onFirstLine && atFirstColumn;
  }
}

// Entry rule: either a NEWLINE followed by a nested parse, or a block;
// in both cases EOF must follow.
parse
 :( NEWLINE parse 
 | block ) EOF 
 ;

// A possibly empty sequence of statements (each optionally followed by a
// NEWLINE) and function declarations.
block
 : (statement NEWLINE?| functionDecl)*  
 ;

// A single statement: simple (assignment, call, bare expression) or
// compound (if / for / while).
statement
 : assignment 
 | functionCall 
 | ifStatement
 | forStatement
 | whileStatement
 | arithmetic_expression
 ;

// Assignment to a variable or, when indexes are present, to an indexed element.
assignment
 : IDENTIFIER indexes? '=' expression 
 ;

// Call of a user-named function, or the built-in print.
// NOTE(review): for print, OPAREN and CPAREN are independently optional, so
// unbalanced forms such as an opening parenthesis without a closing one are
// accepted, and OPAREN still increments the lexer's `opened` counter.
// Confirm this is intended.
functionCall
 : IDENTIFIER OPAREN exprList? CPAREN   #identifierFunctionCall
 | PRINT OPAREN? exprList? CPAREN?     #printFunctionCall
 ;

// An expression used on its own as a statement.
 arithmetic_expression
 : expression
 ;

// Compound statements. Every header ends with COLON NEWLINE and every body
// is a block enclosed in INDENT ... DEDENT tokens, which are produced by the
// lexer (NEWLINE rule and nextToken), not matched from literal text.

// if, followed by any number of elif branches and an optional else branch.
ifStatement
 : ifStat elifStat* elseStat? 
 ;

ifStat
 : IF expression COLON NEWLINE INDENT block DEDENT
 ;

elifStat
 : ELIF expression COLON NEWLINE INDENT block DEDENT 
 ;

// Also reused as the optional else branch of for and while loops.
elseStat
 : ELSE COLON NEWLINE INDENT block DEDENT
 ;

// Function definition with an optional list of parameter names.
functionDecl
 : DEF IDENTIFIER OPAREN idList? CPAREN COLON NEWLINE INDENT block DEDENT 
 ;

// for-in loop with an optional else branch.
forStatement
 : FOR IDENTIFIER IN expression COLON NEWLINE INDENT block DEDENT elseStat?
 ;

// while loop with an optional else branch.
whileStatement
 : WHILE expression COLON NEWLINE INDENT block DEDENT elseStat?
 ;

// Comma-separated parameter names of a function declaration.
idList
 : IDENTIFIER (',' IDENTIFIER)*
 ;

// Comma-separated expressions (call arguments, list elements).
exprList
 : expression (COMMA expression)*
 ;

// Expressions. For the left-recursive operator alternatives, ANTLR gives
// higher precedence to alternatives listed earlier. Binary operators are
// left-associative by default; exponentiation and the ternary operator are
// marked <assoc=right> so that `a ** b ** c` groups as `a ** (b ** c)` and
// `a ? b : c ? d : e` groups as `a ? b : (c ? d : e)`.
expression
 : '-' expression                           #unaryMinusExpression
 | '!' expression                           #notExpression
 | <assoc=right> expression '**' expression #powerExpression
 | expression '*' expression                #multiplyExpression
 | expression '/' expression                #divideExpression
 | expression '%' expression                #modulusExpression
 | expression '+' expression                #addExpression
 | expression '-' expression                #subtractExpression
 | expression '>=' expression               #gtEqExpression
 | expression '<=' expression               #ltEqExpression
 | expression '>' expression                #gtExpression
 | expression '<' expression                #ltExpression
 | expression '==' expression               #eqExpression
 | expression '!=' expression               #notEqExpression
 | expression '&&' expression               #andExpression
 | expression '||' expression               #orExpression
 | <assoc=right> expression '?' expression ':' expression #ternaryExpression
 | expression IN expression                 #inExpression
 | NUMBER                                   #numberExpression
 | BOOL                                     #boolExpression
 | NULL                                     #nullExpression
 | functionCall indexes?                    #functionCallExpression
 | list indexes?                            #listExpression
 | IDENTIFIER indexes?                      #identifierExpression
 | STRING indexes?                          #stringExpression
 | '(' expression ')' indexes?              #expressionExpression
 | INPUT '(' STRING? ')'                    #inputExpression
 ;

// List literal: a possibly empty, comma-separated sequence of expressions
// in square brackets.
list
 : '[' exprList? ']'
 ;

// One or more index accesses, each an expression in square brackets.
indexes
 : ('[' expression ']')+
 ;

// Keywords. They are declared before IDENTIFIER in this grammar.
PRINT    : 'print';
INPUT    : 'input';
DEF      : 'def';
IF       : 'if';
ELSE     : 'else';
ELIF     : 'elif';
RETURN   : 'return';
FOR      : 'for';
WHILE    : 'while';
IN       : 'in';
NULL     : 'null';

// Operators.
OR       : '||';
AND      : '&&';
EQUALS   : '==';
NEQUALS  : '!=';
GTEQUALS : '>=';
LTEQUALS : '<=';
POW      : '**';
EXCL     : '!';
GT       : '>';
LT       : '<';
ADD      : '+';
SUBTRACT : '-';
MULTIPLY : '*';
DIVIDE   : '/';
MODULE  : '%';

// Brackets. Each opening bracket increments and each closing bracket
// decrements the `opened` counter; the NEWLINE rule skips line breaks
// while that counter is greater than zero.
OBRACE   : '{' {opened++;};
CBRACE   : '}' {opened--;};
OBRACKET : '[' {opened++;};
CBRACKET : ']' {opened--;};
OPAREN   : '(' {opened++;};
CPAREN   : ')' {opened--;};

// Punctuation.
SCOLON   : ';';
ASSIGN   : '=';
COMMA    : ',';
QMARK    : '?';
COLON    : ':';

// Boolean literal, spelled in lower case.
BOOL
 : 'true' 
 | 'false'
 ;

// Integer, optionally followed by a dot and zero or more digits
// (so a trailing dot without fraction digits is accepted).
NUMBER
 : INT ('.' DIGIT*)?
 ;

// Name: a letter or underscore followed by letters, digits or underscores.
IDENTIFIER
 : [a-zA-Z_] [a-zA-Z_0-9]*
 ;

// String literal in double or single quotes, confined to a single line.
// A backslash always escapes the character that follows it, so an escaped
// quote never terminates the literal and an escaped backslash directly
// before the closing quote cannot be mistaken for an escaped quote.
STRING
 : ["] ( ~["\\\r\n] | '\\' ~[\r\n] )* ["]
 | ['] ( ~['\\\r\n] | '\\' ~[\r\n] )* [']
 ;

 // Whitespace, comments and backslash line continuations are skipped.
 // NOTE(review): the action calls firstLine(), which is not defined in the
 // @lexer::members section visible in this file — confirm it exists
 // elsewhere, otherwise the generated lexer will not compile.
 SKIPS
 : ( SPACES | COMMENT | LINE_JOINING ){firstLine();} -> skip
 ;

 // Matches either whitespace at the very start of the input or a line break
 // followed by optional indentation. The action then turns the match into
 // NEWLINE / INDENT / DEDENT tokens, or skips it entirely.
 NEWLINE
 : ( {atStartOfInput()}?   SPACES
   | ( '\r'? '\n' | '\r' | '\f' ) SPACES?
   )
   {
     // Split the matched text into its line-break part and its indentation part.
     String newLine = getText().replaceAll("[^\r\n\f]+", "");
     String spaces = getText().replaceAll("[\r\n\f]+", "");
     int next = _input.LA(1);

     if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
       // Inside brackets, on a blank line, or directly before a comment:
       // ignore all indents, dedents and line breaks.
       skip();
     }
     else {
       emit(commonToken(NEWLINE, newLine));

       int indent = getIndentationCount(spaces);
       int previous = indents.isEmpty() ? 0 : indents.peek();

       if (indent == previous) {
         // Same indentation as the current block: only the NEWLINE emitted
         // above is produced, no INDENT or DEDENT.
         skip();
       }
       else if (indent > previous) {
         // Deeper than the current block: record the new level and emit INDENT.
         indents.push(indent);
         emit(commonToken(pythonParser.INDENT, spaces));
       }
       else {
         // Shallower: emit one DEDENT for every recorded level that is deeper
         // than the new indentation (possibly more than one).
         // NOTE(review): the loop does not check that the new indentation
         // equals a level remaining on the stack; confirm whether such
         // inconsistent dedents should be reported.
         while(!indents.isEmpty() && indents.peek() > indent) {
           this.emit(createDedent());
           indents.pop();
         }
       }
     }
   }
 ;

// Integer part of a number: no leading zeros, except the single digit 0.
fragment INT
 : [1-9] DIGIT*
 | '0'
 ;

// A single decimal digit.
fragment DIGIT 
 : [0-9]
 ;

// One or more spaces or tabs.
 fragment SPACES
 : [ \t]+
 ;

// A comment: from '#' up to, but not including, the end of the line.
 fragment COMMENT
 : '#' ~[\r\n\f]*
 ;

// Explicit line continuation: a backslash, optional spaces or tabs, then a
// line break.
 fragment LINE_JOINING
 : '\\' SPACES? ( '\r'? '\n' | '\r' | '\f' )
 ; 

1 个答案:

答案 0 :(得分:0)

不,这不应该在语法(词法分析器)中处理。词法分析器只需照常发出(位置错误的)INDENT 标记,由解析器在运行时报告错误。例如:

// Sample input: the last line is indented one level deeper than the
// statement before it, although that statement does not open a block.
String source = "x=10\n" +
        "while x>2 :\n" +
        "    print(\"hello\")\n" +
        "        x=x-3\n";

Python3Lexer lexer = new Python3Lexer(CharStreams.fromString(source));
Python3Parser parser = new Python3Parser(new CommonTokenStream(lexer));

// Remove default error-handling
parser.removeErrorListeners();

// Add custom error-handling
parser.addErrorListener(new BaseErrorListener() {
  @Override
  public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line,
                          int charPositionInLine, String msg, RecognitionException e) {

    // Check the type before casting, so that a missing symbol or a token
    // class other than CommonToken does not raise an exception in the listener.
    if (offendingSymbol instanceof CommonToken
        && ((CommonToken) offendingSymbol).getType() == Python3Parser.INDENT) {
      // The parser encountered an unexpected INDENT token
      // TODO throw your exception
    }

    // TODO handle other errors
  }
});

// Trigger the error
parser.file_input();