我正在努力掌握正则表达式,尤其是 match.start()
和 match.end()
方法。
我在使用这段代码时(出处见这里):
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(code):
    """Yield Token tuples for *code*.

    Whitespace and newlines are consumed but not emitted; columns are
    0-based offsets from the start of the current line.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    spec = [
        ('NUMBER',   r'\d+(\.\d*)?'),  # integer or decimal literal
        ('ASSIGN',   r':='),           # assignment operator
        ('END',      r';'),            # statement terminator
        ('ID',       r'[A-Za-z]+'),    # identifier (may be a keyword)
        ('OP',       r'[+\-*/]'),      # arithmetic operator
        ('NEWLINE',  r'\n'),           # line ending
        ('SKIP',     r'[ \t]+'),       # inter-token whitespace
        ('MISMATCH', r'.'),            # anything else is an error
    ]
    # One alternation with a named group per token kind; m.lastgroup
    # then tells us which alternative matched.
    master = '|'.join('(?P<%s>%s)' % pair for pair in spec)
    lineno = 1
    start_of_line = 0
    for m in re.finditer(master, code):
        kind = m.lastgroup
        text = m.group(kind)
        if kind == 'NEWLINE':
            # The next line begins right after this '\n'; remembering its
            # index lets us compute columns relative to the line start.
            start_of_line = m.end()
            lineno += 1
            continue
        if kind == 'SKIP':
            continue
        if kind == 'MISMATCH':
            raise RuntimeError('%r unexpected on line %d' % (text, lineno))
        if kind == 'ID' and text in keywords:
            kind = text
        yield Token(kind, text, lineno, m.start() - start_of_line)
# Demo: tokenize a small BASIC-like snippet and print every token.
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for tok in tokenize(statements):
    print(tok)
我无法理解用 mo.start()
和 mo.end()
计算行号和列号的逻辑。例如,如果我让 NEWLINE
和 SKIP
也产生 Token
输出,那么列索引就会完全错乱。
我尝试用 mo.end()
重新计算列索引来适应上面提到的情况,但没有成功。
任何想法、示例代码和/或解释都会非常有帮助。
答案 0(得分:2):
mo.start
和mo.end
会返回匹配的开始和结束索引,以便string[mo.start():mo.end()]
返回匹配的字符串。每当您的示例与\n
匹配时,它将增加跟踪当前行的line_num
并更新line_start
以包含当前行中第一个字符的索引。这允许程序稍后在匹配令牌时计算列:column = mo.start() - line_start
。
为了说明行和列跟踪行为,我创建了一个查找给定字符串中所有数字的简单示例。对于每个数字,它将输出行和起始列:
import re

# Toy example: locate every run of digits in a multi-line string and
# print its line number and 0-based starting column.
PATTERN = '(?P<NEWLINE>\n)|(?P<NUMBER>\d+)'
s = '''word he12re 5 there
mo912re
another line 17
'''
line = 1
line_start = 0
for mo in re.finditer(PATTERN, s):
    group = mo.lastgroup
    if group == 'NEWLINE':
        # A new line starts right after the newline character: record that
        # index so later columns are measured from the line's first char,
        # and bump the line counter.
        line_start = mo.end()
        line += 1
    elif group == 'NUMBER':
        # Column = start of the match minus index of the line's first char.
        column = mo.start() - line_start
        print('line {0}: {1} at column {2}'.format(line, mo.group(0), column))
输出:
line 1: 12 at column 7
line 1: 5 at column 12
line 2: 912 at column 2
line 3: 17 at column 13
答案 1(得分:1):
这是我认为符合您标准的实施方案:如果您可以发布您尝试过的内容,或许我们可以更好地了解您遇到的问题。
import collections
import re
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])


def tokenize(code):
    """Yield a Token for *every* match in *code*, NEWLINE and SKIP included.

    Line and column numbers are 1-based; a NEWLINE token is reported on
    the line it terminates, and bookkeeping advances only afterwards.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    spec = [
        ('NUMBER',   r'\d+(\.\d*)?'),  # integer or decimal literal
        ('ASSIGN',   r':='),           # assignment operator
        ('END',      r';'),            # statement terminator
        ('ID',       r'[A-Za-z]+'),    # identifier (may be a keyword)
        ('OP',       r'[+\-*/]'),      # arithmetic operator
        ('NEWLINE',  r'\n'),           # line ending
        ('SKIP',     r'[ \t]+'),       # inter-token whitespace
        ('MISMATCH', r'.'),            # anything else is an error
    ]
    pattern = '|'.join('(?P<%s>%s)' % pair for pair in spec)
    line_num = 1
    line_start = 0
    for match in re.finditer(pattern, code):
        kind = match.lastgroup
        value = match.group(kind)
        # 1-based column: offset from the line's first character, plus one.
        column = (match.start() - line_start) + 1
        if kind == 'MISMATCH':
            raise RuntimeError('%r unexpected on line %d' % (value, line_num))
        if kind == 'ID' and value in keywords:
            kind = value
        yield Token(kind, value, line_num, column)
        if kind == 'NEWLINE':
            # Advance only after yielding, so the NEWLINE token itself
            # carries the line/column of the line it ends.
            line_start = match.end()
            line_num += 1
# Demo: tokenize a small BASIC-like snippet and print every token,
# whitespace and newlines included.
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for tok in tokenize(statements):
    print(tok)
输出:
Token(typ='NEWLINE', value='\n', line=1, column=1)
Token(typ='SKIP', value=' ', line=2, column=1)
Token(typ='IF', value='IF', line=2, column=5)
Token(typ='SKIP', value=' ', line=2, column=7)
Token(typ='ID', value='quantity', line=2, column=8)
Token(typ='SKIP', value=' ', line=2, column=16)
Token(typ='THEN', value='THEN', line=2, column=17)
Token(typ='SKIP', value=' ', line=2, column=21)
Token(typ='NEWLINE', value='\n', line=2, column=22)
Token(typ='SKIP', value=' ', line=3, column=1)
Token(typ='ID', value='total', line=3, column=9)
Token(typ='SKIP', value=' ', line=3, column=14)
Token(typ='ASSIGN', value=':=', line=3, column=15)
Token(typ='SKIP', value=' ', line=3, column=17)
Token(typ='ID', value='total', line=3, column=18)
Token(typ='SKIP', value=' ', line=3, column=23)
Token(typ='OP', value='+', line=3, column=24)
Token(typ='SKIP', value=' ', line=3, column=25)
Token(typ='ID', value='price', line=3, column=26)
Token(typ='SKIP', value=' ', line=3, column=31)
Token(typ='OP', value='*', line=3, column=32)
Token(typ='SKIP', value=' ', line=3, column=33)
Token(typ='ID', value='quantity', line=3, column=34)
Token(typ='END', value=';', line=3, column=42)
Token(typ='NEWLINE', value='\n', line=3, column=43)
Token(typ='SKIP', value=' ', line=4, column=1)
Token(typ='ID', value='tax', line=4, column=9)
Token(typ='SKIP', value=' ', line=4, column=12)
Token(typ='ASSIGN', value=':=', line=4, column=13)
Token(typ='SKIP', value=' ', line=4, column=15)
Token(typ='ID', value='price', line=4, column=16)
Token(typ='SKIP', value=' ', line=4, column=21)
Token(typ='OP', value='*', line=4, column=22)
Token(typ='SKIP', value=' ', line=4, column=23)
Token(typ='NUMBER', value='0.05', line=4, column=24)
Token(typ='END', value=';', line=4, column=28)
Token(typ='NEWLINE', value='\n', line=4, column=29)
Token(typ='SKIP', value=' ', line=5, column=1)
Token(typ='ENDIF', value='ENDIF', line=5, column=5)
Token(typ='END', value=';', line=5, column=10)
Token(typ='NEWLINE', value='\n', line=5, column=11)