我是python的新手,我正尝试使用pyparsing解析一些看起来像这样的数据
string2 = """
object1 {
key1 = value1
key2 = value2
#key3 = value3
key4 = value4
#key5 = value5
key6 = value6
subobject1 {
key1 = value1
key2 = value2
key3 = value3
}
}
"""
并且我可以使用此代码获得键=值对
def parse_objects(source):
    """Parse a brace-delimited object description into nested results.

    The expected shape is ``name { ... }`` where the body holds any mix of
    ``key = value`` pairs and nested ``name { ... }`` objects.

    Args:
        source: The text to parse.

    Returns:
        The pyparsing results for the outermost object: a group holding the
        object name followed by a group with the body items. Each
        ``key = value`` pair is its own group with the results names
        ``key`` and ``value``.

    Raises:
        ParseException: If ``source`` does not match the grammar.
    """
    # The punctuation is matched but left out of the results.
    LBRACE, EQ, RBRACE = map(Suppress, '{=}')
    object_name = Word(printables)
    key = Word(printables)
    value = Word(printables)
    # NOTE: an earlier draft picked between two grammars with
    # `if LineStart() == HASH:`. That test runs once, while the grammar is
    # being built, and compares two parser objects instead of looking at any
    # input line, so the '#' branch was never selected. A leading '#' is
    # still accepted because Word(printables) matches it as part of the key.
    key_and_value = Group(key('key') + EQ + value('value'))
    # Forward declaration: an object body may contain further objects.
    collection = Forward()
    object_body = Group(LBRACE + ZeroOrMore(collection | key_and_value) + RBRACE)
    collection <<= Group(object_name + object_body)
    return collection.parseString(source)


collection = parse_objects(string2)
print(collection.dump())
但是我还需要解析不包含对象值,仅包含键的数据。例如
object1 {
key1 = value1
key2
#key3 = value3
key4
#key5 = value5
key6 = value6
subobject1 {
key1 = value1
key2 = value2
key3 = value3
}
}
我试图更改代码并添加检查表达式if value is None
。
像这样
if value is None:
key_and_value = Group(key('key'))
else:
if LineStart() == HASH:
key_and_value = Group(HASH + key('key') + EQ + value('value'))
else:
key_and_value = Group(key('key') + EQ + value('value'))
但是我得到一个错误
Match W:(0123...) at loc 19(3,9)
Matched W:(0123...) -> ['key1']
Match W:(0123...) at loc 25(3,15)
Matched W:(0123...) -> ['value1']
Match W:(0123...) at loc 41(4,9)
Matched W:(0123...) -> ['key2']
Traceback (most recent call last):
File "c:\Python27\my_projects\test_parser.py", line 86, in <module>
collection = parse_objects(string2)
File "c:\Python27\my_projects\test_parser.py", line 84, in parse_objects
return collection.parseString(source)
File "C:\Python27\lib\site-packages\pyparsing.py", line 1632, in parseString
raise exc
ParseException: Expected "}" (at char 41), (line:4, col:9)
我认为pyparsing将键作为子对象,但是找不到{
。
谁能给我任何建议?也许我需要改变我的语法方法?
感谢您的帮助。
编辑1
@Jappy的解决方案非常适合我上面编写的数据,当subobject1部分位于主体部分的底部时。在分析我的数据之后,我发现在subobject1部分之后可能有更多的key = value对或仅是键,像这样:
string2 = """
object1 {
key1 = value1
key2
#key3 = value3
key4 = value4
subobject1 {
key1 = value1
key2 = value2
key3 = value3
}
#key5 = value5
key6 = v_a_l_u_e_6
subobject2 {
key1 = value1
}
key7 = value7
key8
}
"""
输出如下:
[['object1', ['key1', 'value1'], ['key2', 'null'], ['#key3', 'value3'], ['key4', 'value4'], ['subobject1', ['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']], ['#key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', ['key1', 'value1']], ['key7', 'value7'], ['key8', 'null']]]
- objects: ['object1', ['key1', 'value1'], ['key2', 'null'], ['#key3', 'value3'],
['key4', 'value4'], ['subobject1', ['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']], ['#key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', ['key1', 'value1']], ['key7', 'value7'], ['key8', 'null']]
- key_val_lines: [['key7', 'value7'], ['key8', 'null']]
[0]:
['key7', 'value7']
- key: 'key7'
- val: 'value7'
[1]:
['key8', 'null']
- key: 'key8'
- val: 'null'
- obj_name: 'object1'
- objects: ['subobject2', ['key1', 'value1']]
- key_val_lines: [['key1', 'value1']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
- obj_name: 'subobject2'
我这样更改了代码:
# Bare string literals used inside expressions are wrapped in Suppress, so
# '=', '{' and '}' are matched but left out of the results.
ParserElement.inlineLiteralsUsing(Suppress)
# Braces are excluded so that they are never consumed as part of a name.
name_expr = Word(printables, excludeChars='{}')
key_val_expr = '=' + Word(printables)
# A key is followed either by the end of the line (the value then becomes the
# string 'null') or by '= value'.
key_val_line = Group(name_expr('key') + (lineEnd().setParseAction(lambda t: 'null') | key_val_expr)('val'))
# Earlier variant, kept for comparison:
#key_val_lines = OneOrMore(key_val_line)('key_val_lines')
obj = Forward()
# Key lines and nested objects may appear in any order inside the braces.
objects = Group('{' + OneOrMore(key_val_line | obj) + '}')
# <<= attaches the definition to the Forward placeholder declared above.
obj <<= Group(name_expr('obj_name') + objects('objects'))
# Earlier variant, kept for comparison:
#obj << Group(name_expr('obj_name') + '{' + OneOrMore(key_val_lines | obj) + '}')('objects')
o = obj.parseString(string2)
# print() as a function also runs on Python 3 and matches the other snippets.
print(o.dump())
结果是:
[['object1', [['key1', 'value1'], ['key2', 'null'], ['#key3', 'value3'], ['key4',
'value4'], ['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]], ['#key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', [['key1', 'value1']]], ['key7', 'value7'], ['key8', 'null']]]]
[0]:
['object1', [['key1', 'value1'], ['key2', 'null'], ['#key3', 'value3'], ['key4', 'value4'], ['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]], ['#key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', [['key1', 'value1']]], ['key7', 'value7'], ['key8', 'null']]]
- obj_name: 'object1'
- objects: [['key1', 'value1'], ['key2', 'null'], ['#key3', 'value3'], ['key4',
'value4'], ['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]], ['#key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', [['key1', 'value1']]], ['key7', 'value7'], ['key8', 'null']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
[1]:
['key2', 'null']
- key: 'key2'
- val: 'null'
[2]:
['#key3', 'value3']
- key: '#key3'
- val: 'value3'
[3]:
['key4', 'value4']
- key: 'key4'
- val: 'value4'
[4]:
['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]]
- obj_name: 'subobject1'
- objects: [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
[1]:
['key2', 'value2']
- key: 'key2'
- val: 'value2'
[2]:
['key3', 'value3']
- key: 'key3'
- val: 'value3'
[5]:
['#key5', 'value5']
- key: '#key5'
- val: 'value5'
[6]:
['key6', 'v_a_l_u_e_6']
- key: 'key6'
- val: 'v_a_l_u_e_6'
[7]:
['subobject2', [['key1', 'value1']]]
- obj_name: 'subobject2'
- objects: [['key1', 'value1']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
[8]:
['key7', 'value7']
- key: 'key7'
- val: 'value7'
[9]:
['key8', 'null']
- key: 'key8'
- val: 'null'
但是我无法通过setResultsName为索引[0]处的分组设置结果名称:
obj << Group(name_expr('obj_name') + objects('objects'))('section')
返回错误的结果。
答案 0 :(得分:1)
这应该可以帮助您。有关详细信息,请参见评论。
from pyparsing import *
test_string ='''
object1 {
key1 = value1
key2
#key3 = value3
key4
#key5 = value5
key6 = value6
subobject1 {
key1 = value1
key2 = value2
key3 = value3
}
}'''
# interpret inline 'string' as Suppress('string'),
# instead of LBRACE,EQ,RBRACE,HASH = map(Suppress, '{=}#')
ParserElement.inlineLiteralsUsing(Suppress)
# be sure to exclude special characters when using printables
name_expr = Word(printables, excludeChars='{}')
key_val_expr = '=' + Word(printables)
# p1('name') is equivalent to p1.setResultsName('name')
# p1 | p2 is equivalent to MatchFirst([p1, p2])
# if lineEnd() matches first, there is no value;
# a parse action then returns the string 'NONE' as the value instead.
# otherwise, match a regular '= value'
# also, you have to use Group because key_val_line is a repeating element
key_val_line = Group(name_expr('key') + (lineEnd().setParseAction(lambda t: 'NONE') | key_val_expr)('val'))
key_val_lines = OneOrMore(key_val_line)('key_val_lines')
obj = Forward()
# <<= (rather than <<) attaches the recursive definition to the Forward;
# it is the form used by the other snippets and is safe with respect to
# operator precedence.
obj <<= Group(name_expr('obj_name') + '{' + OneOrMore(key_val_lines | obj) + '}')('objects')
parse_results = obj.parseString(test_string)
print(parse_results.dump())
这将打印以下内容:
[['object1', ['key1', 'value1'], ['key2', 'NONE'], ['#key3', 'value3'], ['key4', 'NONE'], ['#key5', 'value5'], ['key6', 'value6'], ['subobject1', ['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]]]
- objects: ['object1', ['key1', 'value1'], ['key2', 'NONE'], ['#key3', 'value3'], ['key4', 'NONE'], ['#key5', 'value5'], ['key6', 'value6'], ['subobject1', ['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]]
- key_val_lines: [['key1', 'value1'], ['key2', 'NONE'], ['#key3', 'value3'], ['key4', 'NONE'], ['#key5', 'value5'], ['key6', 'value6']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
[1]:
['key2', 'NONE']
- key: 'key2'
- val: 'NONE'
[2]:
['#key3', 'value3']
- key: '#key3'
- val: 'value3'
[3]:
['key4', 'NONE']
- key: 'key4'
- val: 'NONE'
[4]:
['#key5', 'value5']
- key: '#key5'
- val: 'value5'
[5]:
['key6', 'value6']
- key: 'key6'
- val: 'value6'
- obj_name: 'object1'
- objects: ['subobject1', ['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]
- key_val_lines: [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]
[0]:
['key1', 'value1']
- key: 'key1'
- val: 'value1'
[1]:
['key2', 'value2']
- key: 'key2'
- val: 'value2'
[2]:
['key3', 'value3']
- key: 'key3'
- val: 'value3'
- obj_name: 'subobject1'
答案 1 :(得分:0)
@Jeppi的答案有一些很棒的建议。我会补充:
Word(printables)
总是有风险的构造,因为它会匹配尽可能多的非空白字符。例如,如果一行包含不带空格的“color=red”,则该行将被解释为键“color=red”而没有值。您最好使用Word(alphanums)
或Word(alphas, alphanums+"_")
之类的定义键。要允许可能的前导“#”,请使用Word(alphas+'#', alphanums+"_")
。
您关于使用if LineStart() == HASH
来有条件地处理'#'的想法很有趣,但pyparsing并不是这样工作的。在代码执行到这里时,您仍在构建解析器本身,这一步与任何输入文本都无关。特定行是否以“#”开头的实际判定是在解析过程中进行的,也就是稍后在您的代码调用collection.parseString
时完成。也就是说,您将建立所有解析器位,然后将它们指向源文本。任何“如果存在字符X”逻辑都需要在解析器本身中使用某种替换或可选结构表示,而不是使用Python if-then代码。
考虑将pyparsing的Optional类用于可能存在或可能不存在的元素。这适用于可能没有值的key-value
,也可能是处理键名中可能的前导'#'字符的另一种方法。
答案 2 :(得分:0)
递归解析器不是pyparsing的简单开始,而您的可选位也使事情变得更加复杂。我认为这段代码基本上可以满足您的要求-到目前为止,您已经完成了与pyparsing的搏斗,希望对您来说更有意义:
import pyparsing as pp
# Grammar for nested "name { ... }" objects whose bodies hold nested objects,
# "key = value" pairs and bare keys, each optionally prefixed with '#'.

# punctuation is matched but suppressed from the results
LBRACE, RBRACE, EQ = map(pp.Suppress, "{}=")
# convert parsed '#' to a bool that you can test on
disabled_marker = pp.Literal("#").addParseAction(lambda: True)
identifier = pp.pyparsing_common.identifier
key = identifier()
# try to parse a numeric value first, might be interesting
# pyparsing_common.number will auto-convert string to float or int at parse time,
# so you won't have to detect and do the conversion later
value = pp.pyparsing_common.number | pp.Word(pp.printables)
# placeholder for "one item inside an object"; it is filled in at the bottom
# because items include nested objects, which in turn contain items
obj_item = pp.Forward()
# an object is a name followed by a brace-delimited group of items;
# the name is stored as "name" and the grouped items as "attributes"
obj_expr = pp.Group(identifier("name")
+ pp.Group(LBRACE
+ pp.ZeroOrMore(obj_item)
+ RBRACE)("attributes"))
# "key = value", optionally preceded by '#'; "disabled" is only present in the
# results when the marker was matched
key_with_value = pp.Group(pp.Optional(disabled_marker)("disabled")
+ key("key") + EQ + value("value"))
# use empty() to inject a None for the value
key_without_value = pp.Group(pp.Optional(disabled_marker)("disabled")
+ key("key")
+ pp.empty().addParseAction(lambda: [None])("value"))
# now define an item that can be used in an object - this order is important!
# '|' takes the first alternative that matches, and key_without_value succeeds
# on any bare identifier, so it has to come last; otherwise it would claim the
# name of a nested object or the key of a "key = value" pair.
obj_item <<= obj_expr | key_with_value | key_without_value
要解析您的string2
输入:
zz = obj_expr.parseString(string2)
print(zz[0].dump())
输出:
['object1', [['key1', 'value1'], ['key2', None], [True, 'key3', 'value3'], ['key4', 'value4'], ['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]], [True, 'key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', [['key1', 'value1']]], ['key7', 'value7'], ['key8', None]]]
- attributes: [['key1', 'value1'], ['key2', None], [True, 'key3', 'value3'], ['key4', 'value4'], ['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]], [True, 'key5', 'value5'], ['key6', 'v_a_l_u_e_6'], ['subobject2', [['key1', 'value1']]], ['key7', 'value7'], ['key8', None]]
[0]:
['key1', 'value1']
- key: 'key1'
- value: 'value1'
[1]:
['key2', None]
- key: 'key2'
- value: None
[2]:
[True, 'key3', 'value3']
- disabled: True
- key: 'key3'
- value: 'value3'
[3]:
['key4', 'value4']
- key: 'key4'
- value: 'value4'
[4]:
['subobject1', [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]]
- attributes: [['key1', 'value1'], ['key2', 'value2'], ['key3', 'value3']]
[0]:
['key1', 'value1']
- key: 'key1'
- value: 'value1'
[1]:
['key2', 'value2']
- key: 'key2'
- value: 'value2'
[2]:
['key3', 'value3']
- key: 'key3'
- value: 'value3'
- name: 'subobject1'
[5]:
[True, 'key5', 'value5']
- disabled: True
- key: 'key5'
- value: 'value5'
[6]:
['key6', 'v_a_l_u_e_6']
- key: 'key6'
- value: 'v_a_l_u_e_6'
[7]:
['subobject2', [['key1', 'value1']]]
- attributes: [['key1', 'value1']]
[0]:
['key1', 'value1']
- key: 'key1'
- value: 'value1'
- name: 'subobject2'
[8]:
['key7', 'value7']
- key: 'key7'
- value: 'value7'
[9]:
['key8', None]
- key: 'key8'
- value: None
- name: 'object1'
编辑:我删除了Dict构造,因为它们实际上使输出更难以处理。