我试图解析看起来像这样的文件:
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
其中:`[ ]` 表示可选,`{ }` 表示可重复,`[{ }]` 表示可选且可重复;括号内可以是单个段,也可以是由多个段组成的(可嵌套的)分组。
理想的结果是这样的:
{
"MSH": {
"name": "placeholder",
"opt": false,
"rep": false,
"description": "Plain Segment"
},
"PD1": {
"name": "placeholder",
"opt": true,
"rep": false,
"description": "Optional Segment"
},
// some segments here
"group": {
"opt": true,
"rep": false,
"description": "Optionals group placeholder text",
"segment0": {
"ROL": {
"name": "placeholder",
"opt": false,
"rep": true,
"description": "Repeating Segment"
}
}
}
}
我已经阅读了SO和Pyparsing wiki上的大多数pyparsing帖子,包括fourFn.py示例和regexinverter。我相信我需要使用 infixNotation,但我不太了解如何使用它。
这是我到目前为止所做的:
# Asker's pyparsing attempt at the segment grammar.
# NOTE(review): `pp` (presumably `import pyparsing as pp`) and `data` (the
# text to parse) are not defined in this snippet — confirm against the full
# script.

# Literal delimiters: square brackets wrap optional parts, curly braces wrap
# repeating parts (see the optsegment/repsegment definitions below).
lbrack = pp.Literal("[")
rbrack = pp.Literal("]")
lbrace = pp.Literal("{")
rbrace = pp.Literal("}")
# A segment name is exactly three alphanumeric characters.
segment = pp.Word(pp.alphanums,exact=3)
# The three bracketed forms of a single segment: [ SEG ], { SEG }, [{ SEG }].
optsegment = lbrack + segment + rbrack
repsegment = lbrace + segment + rbrace
optrepsegment = lbrack + lbrace + segment + rbrace + rbrack
# Any one segment form, tagged with a results name describing which form matched.
segments = (segment.setResultsName("RawSegment") |
optsegment.setResultsName("OptionalSegment") |
repsegment.setResultsName("RepeatingSegment") |
optrepsegment.setResultsName("OptionalRepeatingSegment"))
# Groups wrap exactly ONE `segments` match in brackets/braces.
# NOTE(review): nothing here refers back to a group, so the grammar is not
# recursive and cannot describe nested structures such as
# `[ { PR1 [{ ROL }] } ]`.
opt_group = pp.Group(lbrack + segments + rbrack)
rep_group = pp.Group(lbrace + segments + rbrace)
message = pp.Group(segments | opt_group | rep_group)
# NOTE(review): `expr` is built but never used below; the brackets and braces
# are passed to infixNotation as if they were operators.
expr = pp.infixNotation(message,
[
('[', 2, pp.opAssoc.LEFT),
('{', 2, pp.opAssoc.LEFT),
('}', 1, pp.opAssoc.RIGHT),
(']', 1, pp.opAssoc.RIGHT),
])
# Scan `data` for every match of `message` and print each one.
msg = message.searchString(data)
# NOTE(review): the loop body's indentation was lost in this paste; the
# `print` call below is the body of the `for` loop.
for item in msg:
print(item)
我还没有敲定输出格式,目前只是想先把解析本身做对。
答案 0 :(得分:2)
以下是lark的代码:
import json
import lark
# Grammar for the segment-structure notation:
#   * SIMPLE_SEGMENT - exactly three characters, each an underscore, letter or digit
#   * o_segment      - one or more segments wrapped in [ ]
#   * r_segment      - one or more segments wrapped in { }
#   * _segment       - one or more of the three forms above; lark inlines rules
#                      whose names start with an underscore, so its children
#                      appear directly under the enclosing rule
# Whitespace between tokens is ignored.
l = lark.Lark("""
start: _segment
SIMPLE_SEGMENT: ("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)("_"|LETTER|DIGIT)
o_segment: "["_segment"]"
r_segment: "{"_segment"}"
_segment: (SIMPLE_SEGMENT|o_segment|r_segment)+
%import common.LETTER
%import common.DIGIT
%import common.WS
%ignore WS
""", parser='lalr') # the LALR parser is selected explicitly; the answer's author prefers it over lark's default parser
def _single_or_group(content):
    """Return a lone segment name unchanged, otherwise the children as a tuple.

    A single *bracketed* child (itself a ``(kind, payload)`` tuple) is kept
    wrapped in a one-element tuple so that ``TreeTransformer.start`` never
    mistakes the child's ``(kind, payload)`` pair for a list of children.
    """
    if len(content) == 1 and isinstance(content[0], str):
        return content[0]
    return tuple(content)


class TreeTransformer(lark.Transformer):
    """Turn the parsed segment tree into a list of JSON-serialisable dicts.

    ``o_segment`` and ``r_segment`` reduce their children to a
    ``(kind, payload)`` pair, where ``kind`` is ``"opt"``, ``"rep"`` or
    ``"rep_opt"`` and ``payload`` is either a segment name (``str``) or a
    tuple of children (a group). ``start`` converts those pairs, and bare
    segment names, into the final dictionaries.
    """

    @staticmethod
    def o_segment(content):
        """Reduce ``[ ... ]`` to ``("opt", payload)`` or ``("rep_opt", payload)``."""
        # `[{ ... }]`: an optional wrapper around a single repeating item is
        # merged into one "rep_opt" item instead of a nested group.
        if len(content) == 1 and isinstance(content[0], tuple) and content[0][0] == 'rep':
            return "rep_opt", content[0][1]
        return "opt", _single_or_group(content)

    @staticmethod
    def r_segment(content):
        """Reduce ``{ ... }`` to ``("rep", payload)``."""
        return "rep", _single_or_group(content)

    def start(self, content):
        """Build the output list for a sequence of children.

        Args:
            content: iterable whose items are either segment names (``str``)
                or ``(kind, payload)`` pairs produced by ``o_segment`` /
                ``r_segment``.

        Returns:
            A list of dicts with the keys ``name``, ``opt``, ``rep`` and
            ``description``, plus ``token_name`` for a segment or ``segments``
            (a nested list of the same shape) for a group.
        """
        out = []
        for token in content:
            if isinstance(token, str):
                out.append({"name": "placeholder",
                            "opt": False,
                            "rep": False,
                            "description": "Plain Segment",
                            "token_name": token})
                continue

            kind, payload = token
            opt = 'opt' in kind
            rep = 'rep' in kind
            # "Repeating" must follow `rep`; keying it on `opt` mislabels
            # `{ X }` as plain and `[ X ]` as repeating.
            prefix = ("Optional " if opt else '') + ("Repeating " if rep else '')
            entry = {"name": "placeholder", "opt": opt, "rep": rep}
            if isinstance(payload, str):
                entry["description"] = prefix + "Segment"
                entry["token_name"] = payload
            else:
                entry["description"] = prefix + "Group"
                # Groups are described recursively with the same structure.
                entry["segments"] = self.start(payload)
            out.append(entry)
        return out
# Demo driver: parse the sample structure from the question, convert the
# resulting tree with TreeTransformer and print it as indented JSON.
transformer = TreeTransformer()
# The sample input is the structure definition quoted in the question.
tree = l.parse("""
MSH
[ PD1 ]
[{ ROL }]
[
{ ROL }
]
[
{
PR1
[{ ROL }]
}
]
[
{
IN1
[ IN2 ]
[{ IN3 }]
}
]
[ ACC ]
""")
# transform() returns the value produced by TreeTransformer.start for the root rule.
print(json.dumps(transformer.transform(tree), indent=4))