(这与我之前的问题以及这个回答有关,也是我努力使用 Python 代码并理解列表、元组和正则表达式的一部分。)
给出下面的示例代码(根据此处的示例修改):
import collections
import re
import string
# One lexical token: its type name, matched text, 1-based line number and
# 0-based column offset within that line.
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(code):
    """Yield a ``Token`` for every lexeme found in ``code``.

    Whitespace (spaces and tabs) and newlines are consumed but not
    yielded. Identifiers whose text is one of the reserved keywords are
    reported with the keyword itself as their type. Any character that
    matches no other rule is yielded as a ``MISMATCH`` token.

    Args:
        code: The source text to scan.

    Yields:
        Token: ``(typ, value, line, column)`` where ``line`` starts at 1
        and ``column`` is the offset from the start of the current line.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER', r'\d+(\.\d*)?'),
        ('ASSIGN', r':='),
        ('END', r';'),
        ('ID', r'[A-Za-z]+'),
        ('OP', r'[+\-*/]'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('MISMATCH', r'.'),
    ]
    # Insert a single-character tag between each token name and its regex,
    # turning every entry into a (name, letter, regex) 3-tuple.
    token_specification = [
        (name, letter, pattern)
        for (name, pattern), letter in zip(token_specification,
                                           string.ascii_uppercase)
    ]
    # The template has exactly two slots, so unpack each 3-tuple and pass
    # only the name and the regex; formatting the whole tuple raises
    # "TypeError: not all arguments converted during string formatting".
    tok_regex = '|'.join(
        '(?P<%s>%s)' % (name, pattern)
        for name, _letter, pattern in token_specification
    )
    line_num = 1
    line_start = 0
    for mo in re.finditer(tok_regex, code):
        kind = mo.lastgroup
        value = mo.group(kind)
        if kind == 'NEWLINE':
            # Remember where the next line begins so that columns can be
            # computed relative to it.
            line_start = mo.end()
            line_num += 1
        elif kind == 'SKIP':
            pass
        else:
            if kind == 'ID' and value in keywords:
                # Reserved words are reported under their own name.
                kind = value
            column = mo.start() - line_start
            yield Token(kind, value, line_num, column)
# Sample program fed to the tokenizer. The leading spaces inside the
# literal are significant: each token's column is measured from the start
# of its line, so the indentation determines the reported columns.
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF; '''

# Print every token produced for the sample program, one per line.
for token in tokenize(statements):
    print(token)
我尝试修改 `tok_regex` 以匹配新的 `token_specification`,如下所示:
tok_regex = '|'.join('(?P<%s>%s%s)' % pair for pair in token_specification)
但它没有正确处理新的 `token_specification` 元组;我不明白应如何调整 `pair`,才能使输出符合预期:
Token(typ='IF', value='IF', line=2, column=4)
Token(typ='ID', value='quantity', line=2, column=7)
Token(typ='THEN', value='THEN', line=2, column=16)
Token(typ='ID', value='total', line=3, column=8)
Token(typ='ASSIGN', value=':=', line=3, column=14)
Token(typ='ID', value='total', line=3, column=17)
Token(typ='OP', value='+', line=3, column=23)
Token(typ='ID', value='price', line=3, column=25)
Token(typ='OP', value='*', line=3, column=31)
Token(typ='ID', value='quantity', line=3, column=33)
Token(typ='END', value=';', line=3, column=41)
Token(typ='ID', value='tax', line=4, column=8)
Token(typ='ASSIGN', value=':=', line=4, column=12)
Token(typ='ID', value='price', line=4, column=15)
Token(typ='OP', value='*', line=4, column=21)
Token(typ='NUMBER', value='0.05', line=4, column=23)
Token(typ='END', value=';', line=4, column=27)
Token(typ='ENDIF', value='ENDIF', line=5, column=4)
Token(typ='END', value=';', line=5, column=9)
为了提供一些背景:我希望能够打印每个 token 的所有元素,包括其对应的单字符字母的信息;为了实现这一点,我是否还应该将 `Token` namedtuple 修改为 `Token = collections.namedtuple('Token', ['typ', 'scl', 'value', 'line', 'column'])`?
我有些不知所措,请提供一些建议。