我想解析包含分号;
的Python代码,用于分隔命令并生成用换行符\n
替换它们的代码。例如,来自
def main():
a = "a;b"; return a
我想制作
def main():
a = "a;b"
return a
任何提示?
答案 0 :(得分:4)
使用tokenize
library查找token.OP
tokens,其中第二个元素是;
* 。用token.NEWLINE
token替换这些令牌。
您也需要调整令牌偏移并生成匹配缩进;因此,在NEWLINE
之后,您需要调整行号(按照您插入的每NEWLINE
增加的偏移量增加)和' next' line(当前行的其余部分)必须调整索引以匹配当前缩进级别:
import tokenize
TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a) # Python 3 compat
def semicolon_to_newline(tokens):
line_offset = 0
last_indent = None
col_offset = None # None or an integer
for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
slno, elno = slno + line_offset, elno + line_offset
if ttype in (tokenize.INDENT, tokenize.DEDENT):
last_indent = ecol # block is indented to this column
elif ttype == tokenize.OP and tstr == ';':
# swap out semicolon with a newline
ttype = tokenize.NEWLINE
tstr = '\n'
line_offset += 1
if col_offset is not None:
scol, ecol = scol - col_offset, ecol - col_offset
col_offset = 0 # next tokens should start at the current indent
elif col_offset is not None:
if not col_offset:
# adjust column by starting column of next token
col_offset = scol - last_indent
scol, ecol = scol - col_offset, ecol - col_offset
if ttype == tokenize.NEWLINE:
col_offset = None
yield TokenInfo(
ttype, tstr, (slno, scol), (elno, ecol), line)
with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
generator = tokenize.generate_tokens(source.readline)
dest.write(tokenize.untokenize(semicolon_to_newline(generator)))
请注意,我无需更正line
值;它只是提供信息,在取消标记时,实际上并未使用从文件中读取的数据。
演示:
>>> from io import StringIO
>>> source = StringIO('''\
... def main():
... a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
a = "a;b"
return a
稍微复杂一点:
>>> source = StringIO('''\
... class Foo(object):
... def bar(self):
... a = 10; b = 11; c = 12
... if self.spam:
... x = 12; return x
... x = 15; return y
...
... def baz(self):
... return self.bar;
... # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
def bar(self):
a = 10
b = 11
c = 12
if self.spam:
x = 12
return x
x = 15
return y
def baz(self):
return self.bar
# note, nothing after the semicolon
>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y
....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon
* {3}的Python 3版本输出更具信息性的tokenize
名为元组,它们具有额外的TokenInfo
属性,可用于代替文字匹配:exact_type
。我保持上面与Python 2和3兼容。
答案 1 :(得分:1)
这是一个pyparsing解决方案 - 请参阅以下代码中的注释:
from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line
SEMI = Literal(';')
patt = SEMI + restOfLine
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)
def split_at(s, locs):
"""
break up s into pieces, given list of break locations
"""
current = 0
ret = []
for loc in locs:
ret.append(s[current:loc].lstrip())
current = loc+1
ret.append(s[current:].lstrip())
return ret
def split_on_semicolon(s,l,tokens):
"""
parse time callback, when finding first unquoted ';' on a line
"""
current_line = line(l,s)
line_body = current_line.lstrip()
indent = current_line.index(line_body)
indent = current_line[:indent]
# may be more than one ';' on this line, find them all
# (the second token contains everything after the ';')
remainder = tokens[1]
if remainder.strip():
all_semis = [s for _,s,_ in SEMI.scanString(remainder)]
# break line into pieces
pieces = split_at(remainder, all_semis)
# rejoin pieces, with leading indents
return '\n'+'\n'.join(indent+piece for piece in pieces)
else:
return ''
patt.addParseAction(split_on_semicolon)
sample = """
def main():
this_semi_does_nothing();
neither_does_this_but_there_are_spaces_afterward();
a = "a;b"; return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000;b("; in quotes"); c=200;return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10; b = 11; c = 12
# this comment; has several; semicolons
if self.spam:
x = 12; return x # so; does; this; one
x = 15;;; y += x; return y
def baz(self):
return self.bar
"""
print(patt.transformString(sample))
给出:
def main():
this_semi_does_nothing()
neither_does_this_but_there_are_spaces_afterward()
a = "a;b"
return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000
b("; in quotes")
c=200
return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10
b = 11
c = 12
# this comment; has several; semicolons
if self.spam:
x = 12
return x # so; does; this; one
x = 15
y += x
return y
def baz(self):
return self.bar