我有一个格式如下的文件:
X ={
a= "someText";
b = 0;
c = 1;
d ={
t = "someText3";
};
f ="someText2";
};
X ={
a= "someText4";
b = 20;
c = 40;
f ="someText6";
d ={
t = "someText5";
};
};
我正在寻找一种智能而强大的方法将其解析为dict列表,如下所示:
X[0] = {'a':"someText",'b':0, 'c':1, 'd':{ 't':'someText3' }, 'f':"someText2"}
X[1] = {'a':"someText4",'b':20, 'c':40, 'd':{ 't':'someText5' }, 'f':"someText6"}
请注意,可能存在嵌套字典,并且变量可以具有不同的出现顺序。
我的方法是通过搜索 '= {' 和 '};' 来跟踪嵌套层级,并据此构建列表。我想知道是否有一种更优雅的方法来解析它。
答案 0 :(得分:2)
下面的简单解析器针对这种简单的字典格式实现了递归下降算法(recursive descent algorithm):
import re
from collections import namedtuple
# First sample: one top-level `X` block containing scalar entries and a
# nested dictionary `d`.
s = """
X ={
a= "someText";
b = 0;
c = 1;
d ={
t = "someText3";
};
f ="someText2";
};
"""
# Second sample: the same kind of block, but with `f` appearing before the
# nested dictionary `d`, i.e. the keys come in a different order.
s1 = """
X ={
a= "someText4";
b = 20;
c = 40;
f ="someText6";
d ={
t = "someText5";
};
};
"""
# A lexical token: `type` is one of 'string', 'int', 'key', 'start', 'end';
# `value` is the matched source text.
token = namedtuple('token', ['type', 'value'])


class Parser:
    """Parse a single ``NAME ={ key = value; ... };`` block into a dict.

    The leading block name and the final ``};`` are discarded, so the result
    in ``self.d`` holds the entries of the top-level block.  Nested blocks
    become nested dictionaries, to any depth and with any number of entries.
    Values are stored as text: quoted strings lose their surrounding quotes,
    numbers keep their source spelling (e.g. ``'0'``).
    """

    # Alternatives recognised by the tokenizer: quoted string, bare word,
    # number, block opener and block terminator.  Everything else
    # (whitespace, '=', a lone ';') is skipped.
    lang = r'"[a-zA-Z0-9]+"|[a-zA-Z]+|\d+|\{|\};'
    # Pattern for each non-string token type, tried in this order.
    token_types = {'int': r'\d+', 'key': r'[a-zA-Z]+', 'start': '{', 'end': '};'}

    def __init__(self, s):
        """Tokenize *s* and prepare an empty result.

        Args:
            s: Source text of exactly one top-level block.
        """
        # Tokenize once; [1:-1] drops the block name and the closing '};'.
        self.starting_with = Parser.tokenize(s)[1:-1]
        self.tokens = iter(self.starting_with)
        self.starts = []   # one entry per currently open '{'
        self.ends = []     # unused; kept so existing attribute access still works
        self.k_list = []   # every key seen so far, in source order
        self.k = None      # most recently seen key
        self.d = {}        # parse result
        self.current_d = {}  # unused; kept so existing attribute access still works

    def parse(self):
        """Consume the remaining tokens and fill ``self.d``.

        The parser keeps a stack of the dictionaries that are currently open,
        so each value is stored in the innermost one and each ``};`` simply
        closes it.  It iterates instead of recursing per token, so input size
        is not limited by the interpreter's recursion limit.

        Returns:
            ``self.d``, the dictionary for the top-level block.

        Raises:
            ValueError: If a ``};`` closes the top-level block before the end
                of the input (for example when the text holds more than one
                top-level block), or if a nested block is never closed.
        """
        open_dicts = [self.d]  # innermost open dictionary is last
        for current in self.tokens:
            if current.type == 'start':
                # The very first '{' opens the top-level block, which is
                # self.d itself; any later one opens a nested dictionary that
                # belongs to the key read just before it.
                if self.starts:
                    child = {}
                    open_dicts[-1][self.k] = child
                    open_dicts.append(child)
                self.starts.append(current.value)
            elif current.type == 'key':
                self.k = current.value
                self.k_list.append(current.value)
            elif current.type == 'end':
                if len(open_dicts) == 1:
                    raise ValueError(
                        "unbalanced '};': the top-level block was closed "
                        "before the end of the input")
                self.starts.pop()
                open_dicts.pop()
            else:
                value = current.value
                if current.type == 'string':
                    value = value[1:-1]  # strip the surrounding quotes
                open_dicts[-1][self.k] = value
        if len(open_dicts) > 1:
            raise ValueError("unexpected end of input: nested block is not closed")
        return self.d

    @classmethod
    def tokenize(cls, s):
        """Split *s* into a list of ``token(type, value)`` tuples.

        Args:
            s: Source text to scan.

        Returns:
            The tokens matched by ``cls.lang``, in source order.
        """
        return [token(cls._classify(text), text) for text in re.findall(cls.lang, s)]

    @classmethod
    def _classify(cls, text):
        """Return the token type name for one piece of matched text."""
        if text.startswith('"') and text.endswith('"'):
            return 'string'
        for name, pattern in cls.token_types.items():
            # Require the whole text to match so a type cannot be chosen
            # because of a partial hit.
            if re.fullmatch(pattern, text):
                return name
        raise ValueError('unrecognised token: {!r}'.format(text))
# Run the parser over every sample text and gather the resulting
# dictionaries in X, in the same order as the inputs.
dictionaries = [s, s1]
X = []
for source in dictionaries:
    parser = Parser(source)
    parser.parse()
    X.append(parser.d)
# Show both parsed blocks, one per line.
print(X[0], X[1], sep='\n')
输出:
{'a': 'someText', 'c': '1', 'b': '0', 'd': {'t': 'someText3'}, 'f': 'someText2'}
{'a': 'someText4', 'c': '40', 'b': '20', 'd': {'t': 'someText5'}, 'f': 'someText6'}
答案 1 :(得分:1)
使用pyparsing,您无需170的智商也能做到这一点。不过请注意,我发现学习它需要花一些时间。
我用七行代码定义了输入的语法。`result` 列表用于存放pyparsing识别出的带标记的片段。代码的最后几行再根据解析出的项目构建您想要的内容。包含 `previous` 的那段代码是一个我不得不使用的丑陋的临时补丁(kluge),因为我的语法会把 `var` 元素匹配两次。也许您能找出这个缺陷?
# Sample text to parse: two consecutive top-level `X` blocks, each with a
# nested dictionary `d`.
# NOTE: the name `input` shadows the built-in input() for the rest of this
# script.
input = '''\
X ={
a= "someText";
b = 0;
c = 1;
d ={
t = "someText3";
};
f ="someText2";
};
X ={
a= "someText4";
b = 20;
c = 40;
f ="someText6";
d ={
t = "someText5";
};
};'''
import pyparsing as pp
# Flat, ordered log of everything the grammar recognises.  Each parse action
# below appends one (kind, text) tuple; the braces carry no text.
result = []
var = pp.Word(pp.alphas).setParseAction(lambda s: result.append(('var', s[0])))
equals = pp.Literal('=')
semicolon = pp.Literal(';')
a_string = pp.QuotedString('"').setParseAction(lambda s: result.append(('string', s[0])))
number = pp.Word(pp.nums).setParseAction(lambda s: result.append(('number', s[0])))
open_curly = pp.Literal('{').setParseAction(lambda s: result.append(('dict_open', None)))
close_curly = pp.Literal('}').setParseAction(lambda s: result.append(('dict_close', None)))
one_dict = pp.Forward()
# `{ declaration... }` without the `name =` prefix; forward-declared because
# a declaration may itself contain a dictionary body.
dict_body = pp.Forward()
# Stand-alone rule for a scalar assignment; kept for callers that use it,
# but no longer part of `declaration`.
simple = var + equals + pp.Or([a_string, number]) + semicolon
# Parse the shared `name =` prefix exactly once and only then branch on the
# kind of value.  Trying a whole dictionary rule first and falling back to
# the scalar rule would match `var` twice, and because its parse action has a
# side effect, every scalar key would be logged twice in `result`.
declaration = var + equals + (dict_body | a_string | number) + semicolon
# Use pp.ZeroOrMore instead of pp.OneOrMore here if dictionaries may be empty.
dict_body << open_curly + pp.OneOrMore(declaration) + close_curly
one_dict << var + equals + dict_body + semicolon
dict_list = pp.OneOrMore(one_dict)
# Parsing is run for its side effect of filling `result`.
dict_list.parseString(input)
# Walk the (kind, text) log in order and print one line per top-level `X`
# block.
count = 0
previous = None
for item in result:
    kind, text = item
    if kind == 'var' and text == 'X':
        # A variable named X starts a new numbered output line.
        print('\nX[{:d}] = '.format(count), end='')
        count += 1
    elif kind == 'var':
        # Skip a key identical to the last printed key; this hides duplicate
        # `var` entries in the log.
        if item != previous:
            print('{}: '.format(text), end='')
            previous = item
    elif kind == 'dict_open':
        print('{ ', end='')
    elif kind == 'dict_close':
        print('}', end='')
    elif kind == 'number':
        print('{}, '.format(text), end='')
    elif kind == 'string':
        print('"{}", '.format(text), end='')
# Terminate the last output line.
print()
结果:
X[0] = { a: "someText", b: 0, c: 1, d: { t: "someText3", }f: "someText2", }
X[1] = { a: "someText4", b: 20, c: 40, f: "someText6", d: { t: "someText5", }}
编辑:如果字典可能为空,则替换上面代码中的以下行。
one_dict << var + equals + open_curly + pp.ZeroOrMore(declaration) + close_curly + semicolon
答案 2 :(得分:0)
我觉得在这里使用plex更容易,只需要扫描八个表达式。
from io import StringIO
# Sample text (two top-level `X` blocks) wrapped in a StringIO so that it can
# be read as a stream.
# NOTE: the name `input` shadows the built-in input() for the rest of this
# script.
input = StringIO(
'''X ={
a= "someText";
b = 0;
c = 1;
d ={
t = "someText3";
};
f ="someText2";
};
X ={
a= "someText4";
b = 20;
c = 40;
f ="someText6";
d ={
t = "someText5";
};
};''')
from plex import *
from io import StringIO
# A single separator character: space, tab or newline.
space = Any(' \t\n')
# Token table: each entry pairs a plex pattern with either the token name
# reported for a match or IGNORE.
# NOTE(review): IGNORE presumably makes the scanner drop the match without
# returning a token -- confirm against the plex documentation.
lexicon = Lexicon([
# NOTE(review): Range('AZaz') is assumed to mean the ranges A-Z and a-z.
(Rep1(Range('AZaz')), 'var'),
# A double-quoted run of characters; the reported text includes the quotes.
(Str('"') + Rep(AnyBut('"')) + Str('"'), 'quoted'),
(Rep1(Range('09')), 'number'),
# Separators and punctuation carry no information for the printer below.
(space, IGNORE),
(Str('='), IGNORE),
(Str(';'), IGNORE),
(Str('{'), 'open_curly'),
(Str('}'), 'close_curly'),
])
# Scan the input token by token and print one line per top-level `X` block.
scanner = Scanner(lexicon, input)
count = 0
while True:
    token = scanner.read()
    kind, text = token[0], token[1]
    if kind is None:
        # The scanner reports a token type of None once the input is used up.
        break
    if kind == 'var':
        if text == 'X':
            # A variable named X starts a new numbered output line.
            print('\nX[{:d}] = '.format(count), end='')
            count += 1
        else:
            print('{}: '.format(text), end='')
    elif kind in ('number', 'quoted'):
        # Numbers are values, not keys, so they are followed by ', ' exactly
        # like quoted strings (`b = 0;` prints as `b: 0, `, not `b: 0: `).
        print('{}, '.format(text), end='')
    elif kind == 'open_curly':
        print('{} '.format(text), end='')
    elif kind == 'close_curly':
        print('{}, '.format(text), end='')
# Terminate the last output line.
print()
结果:
X[0] = { a: "someText", b: 0: c: 1: d: { t: "someText3", }, f: "someText2", },
X[1] = { a: "someText4", b: 20: c: 40: f: "someText6", d: { t: "someText5", }, },
它的一个重要缺点是,据我所知(AFAIK)它只发布了Py2版本。不过,我花了大约两个小时就让它能在Py3上运行了。
答案 3 :(得分:0)
这是一个使用parsy的实现(它的工作方式类似于pyparsing,但更现代,文档更好,通常会产生更简洁的代码,但需要Python 3.3或更高版本):
from collections import defaultdict
from parsy import generate, regex, seq, string, whitespace
def lexeme(parser):
    """Return *parser* wrapped so optional whitespace on both sides is skipped."""
    padding = whitespace.optional()
    return padding >> parser << padding


# Terminals.
variable = lexeme(regex(r"[A-Za-z]+"))
quote = string('"')
string_literal = lexeme(quote >> regex(r'[^"]*') << quote)
int_literal = lexeme(regex(r'[0-9]+').map(int))


@generate
def value():
    # Written as a generator so that `dict_literal`, which is defined further
    # down, is only looked up when parsing actually runs.
    parsed = yield dict_literal | string_literal | int_literal
    return parsed


# Punctuation, with surrounding whitespace skipped.
equals_sign = lexeme(string("="))
semicolon = lexeme(string(";"))
open_brace = lexeme(string("{"))
close_brace = lexeme(string("}"))

# `name = value;` -- the punctuation is matched but not kept.
statement = seq(variable << equals_sign, value << semicolon)
# `{ statement... }` becomes a dict built from the statements' pairs.
dict_literal = open_brace >> statement.many().map(dict) << close_brace
# A whole file is a sequence of statements.
file_format = statement.many()
def parse(text_input):
    """Parse *text_input* and group the top-level values by variable name.

    Returns a plain dict that maps each top-level variable name to the list
    of values assigned to it, in the order they appear in the text.
    """
    grouped = {}
    for name, parsed in file_format.parse(text_input):
        grouped.setdefault(name, []).append(parsed)
    return grouped
您的示例的输出:
{'X': [{'a': 'someText',
'b': 0,
'c': 1,
'd': {'t': 'someText3'},
'f': 'someText2'},
{'a': 'someText4',
'b': 20,
'c': 40,
'd': {'t': 'someText5'},
'f': 'someText6'}]}
解析由 `file_format.parse` 完成;我添加的 `parse` 函数随后把这个基本解析结果合并成一个字典(同一个顶级变量可以对应多个条目)并返回。它没有完全按照您示例中的格式打印输出,因为如果您想在Python中使用这些值,那样的输出可能并不是您需要的。
您可能需要根据需要进行调整。此外,您可能需要根据实际规则调整所有子解析器(例如,变量名称是否包含数字?字符串文字是否存在转义?)。