给出下面的示例代码(根据此处的示例修改而来):
import collections
import re
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(code):
    """Split *code* into a stream of ``Token`` tuples.

    All token patterns are joined into a single alternation of named
    groups, so the name of the group that matched identifies the token
    type.

    Args:
        code: Source text to scan.

    Yields:
        ``Token(typ, value, line, column)`` for every match except
        whitespace runs and newlines. ``line`` starts at 1 and ``column``
        is the 0-based offset from the start of the current line.
        Identifiers that appear in the keyword set are yielded with the
        keyword itself as ``typ``. A character that no other pattern
        accepts is yielded as a ``MISMATCH`` token; nothing is raised.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    # Order matters: alternatives are tried left to right, so the
    # catch-all MISMATCH pattern has to stay last.
    token_specification = [
        ('NUMBER', r'\d+(\.\d*)?'),
        ('ASSIGN', r':='),
        ('END', r';'),
        ('ID', r'[A-Za-z]+'),
        ('OP', r'[+\-*/]'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('MISMATCH', r'.'),
    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    line_num = 1
    line_start = 0  # Offset in `code` where the current line begins.
    for mo in re.finditer(tok_regex, code):
        kind = mo.lastgroup
        value = mo.group(kind)
        if kind == 'NEWLINE':
            # Newlines only advance the position bookkeeping.
            line_start = mo.end()
            line_num += 1
        elif kind == 'SKIP':
            pass
        else:
            if kind == 'ID' and value in keywords:
                # Report keywords under their own name instead of 'ID'.
                kind = value
            column = mo.start() - line_start
            yield Token(kind, value, line_num, column)
# Sample input for the tokenizer; the text is passed through unchanged.
statements = '''
IF quantity THEN
total := total + price * quantity;
tax := price * 0.05;
ENDIF; '''

# Print every token the generator produces, one per line.
for token in tokenize(statements):
    print(token)
我希望能够在 token_specification 中每个令牌名称与其正则表达式之间自动插入一个单字符的字符串(而不是硬编码),如下所示:
# Desired layout: (name, one-letter tag, pattern) for every token type.
token_specification = [
    ('NUMBER', 'A', r'\d+(\.\d*)?'),
    ('ASSIGN', 'B', r':='),
    ('END', 'C', r';'),
    ('ID', 'D', r'[A-Za-z]+'),
    ('OP', 'E', r'[+\-*/]'),
    ('NEWLINE', 'F', r'\n'),
    ('SKIP', 'G', r'[ \t]+'),
    ('MISMATCH', 'H', r'.'),
]
我考虑过使用 ascii_uppercase,但不知道该用怎样的语法才能在 token_specification 中自动添加这个单字符。
有人可以提供一些指导吗?
答案 0 :(得分:3)
只需在最初的 token_specification 声明之后添加以下内容:
# Rebuild every (name, pattern) pair as (name, letter, pattern), taking the
# letters from ascii_uppercase in order. zip() stops at the shorter input,
# so no more than 26 entries come out.
token_specification = [
    (name, tag, pattern)
    for (name, pattern), tag in zip(token_specification, string.ascii_uppercase)
]
(同时需要在文件开头导入:import string)
答案 1 :(得分:0)
您可以将元组转换为列表,在中间插入字符,然后再转换回元组:
import string

chars = string.ascii_uppercase
result = []
for i, t in enumerate(token_specification):
    # Tuples are immutable, so go through a list to place the letter at
    # index 1, between the name and the pattern.
    t_lst = list(t)
    # chars[i] raises IndexError once there are more than 26 entries.
    t_lst.insert(1, chars[i])
    result.append(tuple(t_lst))
答案 2 :(得分:0)
为了避免字母不够用,首先创建一个迭代器,让它依次产生字母组合序列(A、B、…、Z、AA、AB、…):
from itertools import count, product
from string import ascii_uppercase

# Produce letter groups: A, B, C, ..., Z, AA, AB, ...
# Groups are generated one length at a time (all 1-letter groups, then all
# 2-letter groups, and so on). Padding a fixed-width product with blank
# entries would repeat groups, because the blank can land in any position.
# A generator expression is used so this runs on Python 2 and Python 3
# alike (itertools.imap exists only on Python 2).
charset = list(ascii_uppercase)
characters = (
    ''.join(group)
    for length in count(1)
    for group in product(charset, repeat=length)
)
接下来,使用 zip() 把这些字母插入到 token_specification 每一项的名称和模式之间:
# Pair each (name, pattern) entry with the next letter group; zip() ends as
# soon as token_specification runs out.
result = [
    (label, tag, regex)
    for (label, regex), tag in zip(token_specification, characters)
]