我已经成功地为解析器创建了一个扫描程序,然后它将与代码生成器一起使用来创建一个完整的编译器。我的解析器现在可以成功解析分配,if, if-else, while-do
,比较,read and write
命令。我遇到的唯一问题是在while-do
或if/if-else
失败后尝试分配内容时。
更新:我实际上可以在if\if else
和while-do
语句后分配变量,但是找到了另一个问题。我无法制作任何嵌套的if/if else
或while-do
语句。所以现在问题是如何修复嵌套的布尔和循环语句。
我的解析器使用if-then
语句的方式是:
if z < 4 then
z := z + 7
end
写出来的是:
Statements
If
LESS
z
4
Statements
Assign
z
ADD
z
7
它识别if并运行一个名为if_statement()
的方法,它扫描if
标记,将比较设置为变量,扫描then
标记,然后设置语句变量。接下来,为了适应if
和if-else
语句,我使用一个名为lookahead()的方法来查看下一个标记。如果它看到end
,则返回要写入文本文档的变量。如果它看到else
,则扫描else
令牌,然后将语句分配给变量,然后查找end
令牌。然后它返回要写入文本文档的所有变量。
此方法的代码如下:
def if_statement():
scanner.consume(Token.IF)
condition = comparison()
scanner.consume(Token.THEN)
then = statements()
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
else:
scanner.consume(Token.ELSE)
otherwise = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, otherwise)
If_AST和If_Else_AST如下:
class If_AST:
def __init__(self, condition, then):
self.condition = condition
self.then = then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1)
class If_Else_AST:
def __init__(self, condition, then, otherwise):
self.condition = condition;
self.then = then;
self.otherwise = otherwise;
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' +\
repr(self.then) + ' else ' + \
repr(self.otherwise) + ' end'
def indented(self, level):
return indent('If-Else', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1) + \
self.otherwise.indented(level+1)
对于while-do
,它会将if
替换为while
,将then
替换为do
。但是为了更好的衡量,我还将包括while方法和类:
def while_statement():
scanner.consume(Token.WHILE)
condition = comparison()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
class While_AST:
def __init__(self, condition, body):
self.condition = condition
self.body = body
def __repr__(self):
return 'while ' + repr(self.condition) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While', level) + \
self.condition.indented(level+1) + \
self.body.indented(level+2)
我意识到我的解析器不支持else if
语句。 Else if
语句不是此练习的一部分。
最后,为了防止有人需要查看所有内容,到目前为止我的完整代码如下:
import re
import sys
class Scanner:
'''The interface comprises the methods lookahead and consume.
Other methods should not be called from outside of this class.'''
def __init__(self, input_file):
'''Reads the whole input_file to input_string, which remains constant.
current_char_index counts how many characters of input_string have
been consumed.
current_token holds the most recently found token and the
corresponding part of input_string.'''
# source code of the program to be compiled
self.input_string = input_file.read()
# index where the unprocessed part of input_string starts
self.current_char_index = 0
# a pair (most recently read token, matched substring of input_string)
self.current_token = self.get_token()
def skip_white_space(self):
'''Consumes all characters in input_string up to the next
non-white-space character.'''
if (self.current_char_index >= len(self.input_string) - 1):
return
while self.input_string[self.current_char_index].isspace():
self.current_char_index += 1
def get_token(self):
'''Returns the next token and the part of input_string it matched.
The returned token is None if there is no next token.
The characters up to the end of the token are consumed.'''
self.skip_white_space()
# find the longest prefix of input_string that matches a token
token, longest = None, ''
for (t, r) in Token.token_regexp:
match = re.match(r, self.input_string[self.current_char_index:])
if match and match.end() > len(longest):
token, longest = t, match.group()
# consume the token by moving the index to the end of the matched part
self.current_char_index += len(longest)
return (token, longest)
def lookahead(self):
'''Returns the next token without consuming it.
Returns None if there is no next token.'''
return self.current_token[0]
def consume(self, *tokens):
'''Returns the next token and consumes it, if it is in tokens.
Raises an exception otherwise.
If the token is a number or an identifier, its value is returned
instead of the token.'''
current = self.current_token
if (len(self.input_string[self.current_char_index:]) == 0):
self.current_token = (None, '')
else:
self.current_token = self.get_token()
if current[0] in tokens:
if current[0] is Token.ID:
#print("ID")
return current[1]
elif current[0] is Token.NUM:
#print("NUM")
return current[1]
else:
return current[0]
else:
raise Exception('non-token detected')
class Token:
# The following enumerates all tokens.
DO = 'DO'
ELSE = 'ELSE'
READ = 'READ'
WRITE = 'WRITE'
END = 'END'
IF = 'IF'
THEN = 'THEN'
WHILE = 'WHILE'
SEM = 'SEM'
BEC = 'BEC'
LESS = 'LESS'
EQ = 'EQ'
GRTR = 'GRTR'
LEQ = 'LEQ'
NEQ = 'NEQ'
GEQ = 'GEQ'
ADD = 'ADD'
SUB = 'SUB'
MUL = 'MUL'
DIV = 'DIV'
LPAR = 'LPAR'
RPAR = 'RPAR'
NUM = 'NUM'
ID = 'ID'
# The following list gives the regular expression to match a token.
# The order in the list matters for mimicking Flex behaviour.
# Longer matches are preferred over shorter ones.
# For same-length matches, the first in the list is preferred.
token_regexp = [
(DO, 'do'),
(ELSE, 'else'),
(READ, 'read'),
(WRITE, 'write'),
(END, 'end'),
(IF, 'if'),
(THEN, 'then'),
(WHILE, 'while'),
(SEM, ';'),
(BEC, ':='),
(LESS, '<'),
(EQ, '='),
(NEQ, '!='),
(GRTR, '>'),
(LEQ, '<='),
(GEQ, '>='),
(ADD, '[+]'), # + is special in regular expressions
(SUB, '-'),
(MUL, '[*]'),
(DIV, '[/]'),
(LPAR, '[(]'), # ( is special in regular expressions
(RPAR, '[)]'), # ) is special in regular expressions
(ID, '[a-z]+'),
(NUM, '[0-9]+'),
]
def indent(s, level):
return ' '*level + s + '\n'
# Each of the following classes is a kind of node in the abstract syntax tree.
# indented(level) returns a string that shows the tree levels by indentation.
class Program_AST:
def __init__(self, program):
self.program = program
def __repr__(self):
return repr(self.program)
def indented(self, level):
return self.program.indented(level)
class Statements_AST:
def __init__(self, statements):
self.statements = statements
def __repr__(self):
result = repr(self.statements[0])
for st in self.statements[1:]:
result += '; ' + repr(st)
return result
def indented(self, level):
result = indent('Statements', level)
for st in self.statements:
result += st.indented(level+1)
return result
class If_AST:
def __init__(self, condition, then):
self.condition = condition
self.then = then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1)
class If_Else_AST:
def __init__(self, condition, then, otherwise):
self.condition = condition;
self.then = then;
self.otherwise = otherwise;
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' +\
repr(self.then) + ' else ' + \
repr(self.otherwise) + ' end'
def indented(self, level):
return indent('If-Else', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1) + \
self.otherwise.indented(level+1)
class While_AST:
def __init__(self, condition, body):
self.condition = condition
self.body = body
def __repr__(self):
return 'while ' + repr(self.condition) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While', level) + \
self.condition.indented(level+1) + \
self.body.indented(level+2)
class Assign_AST:
def __init__(self, identifier, expression):
self.identifier = identifier
self.expression = expression
def __repr__(self):
return repr(self.identifier) + ':=' + repr(self.expression)
def indented(self, level):
return indent('Assign', level) + \
self.identifier.indented(level+1) + \
self.expression.indented(level+1)
class Write_AST:
def __init__(self, expression):
self.expression = expression
def __repr__(self):
return 'write ' + repr(self.expression)
def indented(self, level):
return indent('Write', level) + self.expression.indented(level+1)
class Read_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return 'read ' + repr(self.identifier)
def indented(self, level):
return indent('Read', level) + self.identifier.indented(level+1)
class Comparison_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.LESS:'<', Token.EQ:'=', Token.GRTR:'>',
Token.LEQ:'<=', Token.NEQ:'!=', Token.GEQ:'>=' }
return repr(self.left) + op[self.op] + repr(self.right)
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
class Expression_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.ADD:'+', Token.SUB:'-', Token.MUL:'*', Token.DIV:'/' }
return '(' + repr(self.left) + op[self.op] + repr(self.right) + ')'
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
class Number_AST:
def __init__(self, number):
self.number = number
def __repr__(self):
return self.number
def indented(self, level):
return indent(self.number, level)
class Identifier_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return self.identifier
def indented(self, level):
return indent(self.identifier, level)
# The following methods comprise the recursive-descent parser.
def program():
sts = statements()
return Program_AST(sts)
def statements():
result = [statement()]
while scanner.lookahead() == Token.SEM:
scanner.consume(Token.SEM)
st = statement()
result.append(st)
return Statements_AST(result)
def statement():
if scanner.lookahead() == Token.IF:
return if_statement()
elif scanner.lookahead() == Token.WHILE:
return while_statement()
elif scanner.lookahead() == Token.ID:
return assignment()
elif scanner.lookahead() == Token.READ:
return read();
elif scanner.lookahead() == Token.WRITE:
return write();
else: # error
return scanner.consume(Token.IF, Token.WHILE, Token.ID)
def if_statement():
scanner.consume(Token.IF)
condition = comparison()
scanner.consume(Token.THEN)
then = statements()
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
else:
scanner.consume(Token.ELSE)
otherwise = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, otherwise)
def while_statement():
scanner.consume(Token.WHILE)
condition = comparison()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
def assignment():
ident = identifier()
scanner.consume(Token.BEC)
expr = expression()
return Assign_AST(ident, expr)
def read():
scanner.consume(Token.READ)
i = identifier()
return Read_AST(i)
def comparison():
left = expression()
op = scanner.consume(Token.LESS, Token.EQ, Token.GRTR,
Token.LEQ, Token.NEQ, Token.GEQ)
right = expression()
return Comparison_AST(left, op, right)
def write():
scanner.consume(Token.WRITE)
e = expression()
return Write_AST(e)
def expression():
result = term()
while scanner.lookahead() in [Token.ADD, Token.SUB]:
op = scanner.consume(Token.ADD, Token.SUB)
tree = term()
result = Expression_AST(result, op, tree)
return result
def term():
result = factor()
while scanner.lookahead() in [Token.MUL, Token.DIV]:
op = scanner.consume(Token.MUL, Token.DIV)
tree = factor()
result = Expression_AST(result, op, tree)
return result
def factor():
if scanner.lookahead() == Token.LPAR:
scanner.consume(Token.LPAR)
result = expression()
scanner.consume(Token.RPAR)
return result
elif scanner.lookahead() == Token.NUM:
value = scanner.consume(Token.NUM)
return Number_AST(value)
elif scanner.lookahead() == Token.ID:
return identifier()
else: # error
return scanner.consume(Token.LPAR, Token.NUM, Token.ID)
def identifier():
value = scanner.consume(Token.ID)
return Identifier_AST(value)
# Initialise scanner.
scanner = Scanner(sys.stdin)
# Show all tokens in the input.
#token = scanner.lookahead()
#test = ''
#
#while token != None:
# print(scanner.consume(token))
# token = scanner.lookahead()
#
#Call the parser.
ast = program()
if scanner.lookahead != None:
print(ast.indented(0))
exit()
#if scanner.lookahead() != None:
# raise Exception('end of input expected but token ' + repr(scanner.lookahead()) + ' found')
# Show the syntax tree with levels indicated by indentation.
有关如何修复代码的任何提示都很棒。谢谢!
答案 0 :(得分:0)
问题是我原来的扫描仪只打印了ID和NUM令牌,从未真正读过它们。因此,通过更改我的扫描程序以使其不仅仅return "NUM " + current[1]
,我将current[1]
分配给变量,以便稍后在consume()
方法结束时返回它。通过这种方式,解析器能够在不打印的情况下查看它是ID还是NUM。
我还更改了if_statement()
和while_statement()
方法,以检查下一个标记是end
,else
,while
还是if
。在找不到任何这些内容时,方法是假设下一个标记是一个赋值,因此搜索Token.ID
或Token.NUM
。