假设我们有以下JSON文件。为了示例起见,它是由字符串模拟的。字符串是输入,Tree
对象应该是输出。我将使用树的图形表示法来呈现输出。
我发现以下类可以处理Python中的树概念:
class TreeNode(object):
    """A tree node holding a payload and an ordered list of child nodes."""

    def __init__(self, data):
        """Create a childless node carrying ``data``."""
        self.children = []
        self.data = data

    def add_child(self, obj):
        """Append ``obj`` to the end of this node's children."""
        self.children.append(obj)

    def __str__(self, level=0):
        """Render this node and its descendants, one node per line.

        Each line is ``repr(data)`` preceded by ``level`` tab characters;
        children are rendered one level deeper, in insertion order.
        """
        own_line = "{}{!r}\n".format("\t" * level, self.data)
        return own_line + "".join(
            kid.__str__(level + 1) for kid in self.children
        )

    def __repr__(self):
        """Return a fixed placeholder string (it does not include ``data``)."""
        return '<tree node representation>'
class Tree:
    """Thin wrapper that owns the root node of a tree."""

    def __init__(self):
        """Start the tree with a single root node whose data is 'ROOT'."""
        self.root = TreeNode('ROOT')

    def __str__(self):
        """Return the rendering produced by the root node's ``__str__``."""
        top = self.root
        return top.__str__()
输入文件的复杂程度可能不同:
输入:
json_file = '{"item1": "end1", "item2": "end2"}'
输出:
"ROOT"
item1
end1
item2
end2
输入:
json_file = {"item1": "end1", "item2": {"item3": "end3"}}
输出:
"ROOT"
item1
end1
item2
item3
end3
输入:
json_file = { "name": "John", "items": [ { "item_name": "lettuce", "price": 2.65, "units": "no" }, { "item_name": "ketchup", "price": 1.51, "units": "litres" } ] }
输出:
"ROOT"
name
John
items
1
item_name
lettuce
price
2.65
units
no
2
item_name
ketchup
price
1.51
units
litres
请注意,数组中的每个项目均以整数(从1开始)描述。
到目前为止,我已经写出了下面的函数,它可以处理简单的情况。对于嵌套的情况,我知道必须使用递归,但目前我得到的是 UnboundLocalError: local variable 'tree' referenced before assignment。
def create_tree_from_JSON(json, parent=None):
    """Build a tree of ``TreeNode`` objects from decoded JSON data.

    Dict keys become nodes, list elements become nodes numbered from 1, and
    every scalar value becomes a leaf under the node of its key or index.

    Args:
        json: A dict, list or scalar. When called without ``parent`` a JSON
            document given as ``str`` is decoded first.
        parent: Node to attach the converted data to. Omit it for the
            top-level call; recursive calls pass the node being filled.

    Returns:
        The new ``Tree`` for a top-level call (``parent`` omitted), otherwise
        ``None`` because the nodes were attached to ``parent`` in place.
    """
    is_root_call = parent is None
    if is_root_call and isinstance(json, str):
        # The parameter shadows the ``json`` module, so import the decoder
        # under its own name.
        from json import loads
        json = loads(json)

    tree = None
    if is_root_call:
        # Only the outermost call owns a Tree; recursive calls just attach.
        tree = Tree()
        root = TreeNode("ROOT")
        tree.root = root
        parent = root

    if isinstance(json, dict):
        entries = json.items()
    elif isinstance(json, list):
        entries = enumerate(json, start=1)
    else:
        # Scalar: it is the leaf of whatever node we were asked to fill.
        parent.add_child(TreeNode(json))
        return tree

    for key, value in entries:
        node = TreeNode(key)
        # Attach before descending so nested containers stay in the tree.
        parent.add_child(node)
        create_tree_from_JSON(value, node)
    return tree
您可能想知道为什么我需要把JSON对象转换为树。您可能知道,PostgreSQL提供了处理数据库中JSON字段的方法。给定一个JSON对象,我可以使用 -> 和 ->> 表示法获取任何字段的值(关于此主题的更多信息,请参阅原帖中的两个链接)。我将基于字段的名称和值创建新表。不幸的是,JSON对象之间的差异太大,我无法手动编写 .sql 代码,因此必须找到一种自动完成的方法。
假设我要基于上面的嵌套示例创建一个表。我需要得到以下 .sql 代码:
select
content_json ->> 'item1' as end1,
content_json -> 'item2' ->> 'item3' as end3
from table_with_json
将 content_json 替换为 "ROOT",您会发现SQL代码中的每一行都只是从 "ROOT" 到叶子的一次深度优先遍历(从最后一个节点到叶子始终使用 ->> 表示法)。
编辑:为了使问题更清楚,我添加了针对数组情形的目标 .sql 查询。我希望查询的数量与数组中的元素数量相同:
select
content_json ->> 'name' as name,
content_json -> 'items' -> 1 ->> 'item_name' as item_name,
content_json -> 'items' -> 1 ->> 'price' as price,
content_json -> 'items' -> 1 ->> 'units' as units
from table_with_json
select
content_json ->> 'name' as name,
content_json -> 'items' -> 2 ->> 'item_name' as item_name,
content_json -> 'items' -> 2 ->> 'price' as price,
content_json -> 'items' -> 2 ->> 'units' as units
from table_with_json
我目前正在测试当前的解决方案:
from collections import OrderedDict
def treeify(data) -> dict:
    """Convert decoded JSON into nested ``OrderedDict`` objects.

    Dicts keep their keys, lists are keyed by 1-based position, and any other
    value is returned unchanged as a leaf.
    """
    if isinstance(data, dict):
        pairs = data.items()
    elif isinstance(data, list):
        # Lists have no keys of their own, so number the elements from 1.
        pairs = enumerate(data, start=1)
    else:
        # Leaf node: nothing to recurse into.
        return data

    tree = OrderedDict()
    for label, subtree in pairs:
        tree[label] = treeify(subtree)
    return tree
def format_query(tree, stack=('content_json',)) -> str:
    """Yield one select-list line per leaf of a nested dict.

    ``stack`` holds the keys walked so far. At a leaf, every key but the last
    is joined with ``->`` (ints bare, everything else single-quoted) and the
    last key is used both after ``->>`` and as the column alias. The leaf
    value itself is not used.
    """
    if not isinstance(tree, dict):
        # Leaf reached: emit the complete key stack as one line.
        *parents, field = stack
        rendered = []
        for part in parents:
            rendered.append(str(part) if isinstance(part, int) else "'%s'" % part)
        yield ' -> '.join(rendered) + " ->> '%s' as %s" % (field, field)
        return

    # Inner node: extend the key stack and descend into each child.
    for key, child in tree.items():
        yield from format_query(child, stack + (key,))
def create_select_query(lines_list):
    """Assemble a ``select ... from table_with_json`` statement.

    Args:
        lines_list: Iterable of select-list expressions (any iterable works,
            including a generator). Entries containing ``"_class"`` are
            skipped.

    Returns:
        The query text: ``select``, then each kept line on its own
        tab-indented row separated by commas, then ``from table_with_json``.
    """
    # Filter first so the separators are computed over the kept lines only;
    # otherwise a skipped final line leaves a dangling comma before ``from``.
    kept = ["\t" + line for line in lines_list if "_class" not in line]
    body = ",\n".join(kept)
    if body:
        body += "\n"
    return "select\n" + body + "from table_with_json"
我目前正在使用这样的JSON:
stack_nested_example = {"_class":"value_to_be_ignored","first_key":{"second_key":{"user_id":"123456","company_id":"9876","question":{"subject":"some_subject","case_type":"urgent","from_date":{"year":2011,"month":11,"day":11},"to_date":{"year":2012,"month":12,"day":12}},"third_key":[{"role":"driver","weather":"great"},{"role":"father","weather":"rainy"}]}}}
在输出中,唯一保持不变的是由数组逻辑处理的那些行的顺序,其他行的顺序每次都不同。我想要的输出应当保留键的顺序:
select
'content_json' -> 'first_key' -> 'second_key' ->> 'user_id' as user_id,
'content_json' -> 'first_key' -> 'second_key' ->> 'company_id' as company_id,
'content_json' -> 'first_key' -> 'second_key' -> 'question' ->> 'subject' as subject,
'content_json' -> 'first_key' -> 'second_key' -> 'question' ->> 'case_type' as case_type,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'from_date' ->> 'year' as year,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'from_date' ->> 'month' as month,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'from_date' ->> 'day' as day,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'to_date' ->> 'year' as year,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'to_date' ->> 'month' as month,
'content_json' -> 'first_key' -> 'second_key' -> 'question' -> 'to_date' ->> 'day' as day,
'content_json' -> 'first_key' -> 'second_key' -> 'third_key' -> 1 ->> 'role' as role,
'content_json' -> 'first_key' -> 'second_key' -> 'third_key' -> 1 ->> 'weather' as weather,
'content_json' -> 'first_key' -> 'second_key' -> 'third_key' -> 2 ->> 'role' as role,
'content_json' -> 'first_key' -> 'second_key' -> 'third_key' -> 2 ->> 'weather' as weather
from table_with_json
答案 0 :(得分:1)
您可以使用递归:
def format_query(d):
    """Render key paths as one or more select statements.

    ``d`` is either a flat sequence of key paths, which produces a single
    statement, or a sequence containing tuples of path lists, where each
    entry's path lists are concatenated and rendered as its own statement
    (statements are separated by a blank line).

    Every path element before the last is single-quoted, integers included;
    the last element is used after ``->>`` and as the column alias.
    """
    if any(isinstance(entry, tuple) for entry in d):
        statements = []
        for group in d:
            # Flatten the path lists of this group into one list of paths.
            merged = [path for part in group for path in part]
            statements.append(format_query(merged))
        return '\n\n'.join(statements)

    columns = []
    for path in d:
        if len(path) == 1:
            accessor = "->> '{}' as {}".format(path[0], path[0])
        else:
            parents = ' -> '.join("'{}'".format(step) for step in path[:-1])
            accessor = "-> {} ->> '{}' as {}".format(parents, path[-1], path[-1])
        columns.append('\tcontent_json {}'.format(accessor))
    return 'select\n{}\nfrom table_with_json'.format(',\n'.join(columns))
def get_dict(d, c=None):
    """Yield the key path of every scalar reachable through nested dicts.

    Args:
        d: Dict to walk.
        c: Key path leading to ``d``; defaults to an empty path.

    Yields:
        ``c + [key]`` for each scalar value. Dict values are delegated to
        ``to_query`` with the extended path; list values are skipped here
        because ``to_query`` handles them.
    """
    # None instead of a shared mutable default list.
    c = [] if c is None else c
    for a, b in d.items():
        if not isinstance(b, (dict, list)):
            yield c + [a]
        elif isinstance(b, dict):
            yield from to_query(b, c + [a])


def to_query(d, q=None):
    """Yield key paths for ``d``, grouped per list element when lists occur.

    Args:
        d: Dict to walk.
        q: Key path leading to ``d``; defaults to an empty path.

    Yields:
        If no value of ``d`` is a list: the same paths as ``get_dict(d, q)``.
        Otherwise: one ``(shared, item_paths)`` tuple per list element, where
        ``shared`` is what ``get_dict(d, q)`` yields and ``item_paths`` are
        the element's paths prefixed with ``q + [key, index]`` (index counted
        from 1).
    """
    q = [] if q is None else q
    if not any(isinstance(i, list) for i in d.values()):
        yield from get_dict(d, c=q)
    else:
        # Carry the prefix ``q`` along; dropping it would truncate the paths
        # of any list that is not at the top level.
        _c = list(get_dict(d, q))
        for a, b in d.items():
            if isinstance(b, list):
                for i, j in enumerate(b, 1):
                    yield (_c, list(get_dict(j, q + [a, i])))
现在,要格式化:
# Example document: one scalar field plus a list of two item objects.
json_file = { "name": "John", "items": [ { "item_name": "lettuce", "price": 2.65, "units": "no" }, { "item_name": "ketchup", "price": 1.51, "units": "litres" } ] }
# Materialize the generator from to_query into a list, then print the text format_query returns for it.
print(format_query(list(to_query(json_file))))
输出:
select
content_json ->> 'name' as name,
content_json -> 'items' -> '1' ->> 'item_name' as item_name,
content_json -> 'items' -> '1' ->> 'price' as price,
content_json -> 'items' -> '1' ->> 'units' as units
from table_with_json
select
content_json ->> 'name' as name,
content_json -> 'items' -> '2' ->> 'item_name' as item_name,
content_json -> 'items' -> '2' ->> 'price' as price,
content_json -> 'items' -> '2' ->> 'units' as units
from table_with_json
答案 1 :(得分:1)
在您的 create_tree_from_JSON 中,递归调用时从未传递过 tree,但函数却总是尝试返回它。
def create_tree_from_JSON(json, parent=None):
    """Annotated sketch of the failing control flow (bodies elided with ``...``).

    ``tree`` is bound only in the ``if not parent`` branch, yet it is returned
    unconditionally, so a call that takes the ``else`` branch reaches
    ``return tree`` with the name unbound.
    """
    if not parent:
        tree = Tree() # tree is only created for root node
        ...
    else:
        parent = parent # tree is not created here
        ...
    return tree # tree is always returned
要么在递归过程中传递 tree,要么把根步骤与其余的递归步骤分开:
def create_tree_from_JSON(json):  # root case
    """Build a ``Tree`` from decoded JSON and return it.

    Creates the tree with a fresh 'ROOT' node, then delegates the recursive
    conversion of ``json`` to ``_walk_tree``.

    Args:
        json: Decoded JSON data to convert.

    Returns:
        The populated ``Tree``.
    """
    tree = Tree()
    root = TreeNode("ROOT")
    tree.root = root
    _walk_tree(json, root)
    # Hand the tree back; without this the caller always receives None.
    return tree
def _walk_tree(json, parent):  # recursive case
    """Attach the contents of ``json`` to ``parent`` as ``TreeNode`` children.

    Args:
        json: A mapping (its keys become nodes) or a list (its elements
            become nodes numbered from 1).
        parent: Node that receives the created children.

    Nested dicts and lists are walked recursively; any other value becomes a
    single leaf under the node of its key or index.
    """
    if isinstance(json, list):
        entries = enumerate(json, start=1)
    else:
        entries = ((key, json[key]) for key in json)

    for key, value in entries:
        node = TreeNode(key)
        # Attach every node, including the heads of nested containers;
        # otherwise whole subtrees are built and then dropped.
        parent.add_child(node)
        if isinstance(value, (dict, list)):
            _walk_tree(value, node)
        else:
            node.add_child(TreeNode(value))
请注意,使用简单的 dict 就可以轻松完成您的任务。您的类实际上只是在 dict 外面包装了一个自定义接口。
def treeify(data) -> dict:
    """Convert decoded JSON into nested plain dicts.

    Dicts keep their keys, lists are keyed by 1-based position, and any other
    value is returned unchanged as a leaf.
    """
    if isinstance(data, dict):
        labelled = data.items()
    elif isinstance(data, list):
        # Lists have no keys of their own, so number the elements from 1.
        labelled = enumerate(data, start=1)
    else:
        # Leaf node: nothing to recurse into.
        return data
    return dict((label, treeify(subtree)) for label, subtree in labelled)
您可以向其提供任何解码的json数据。
>>> treeify(json_file = { "name": "John", "items": [ { "item_name": "lettuce", "price": 2.65, "units": "no" }, { "item_name": "ketchup", "price": 1.51, "units": "litres" } ] })
{'name': 'John', 'items': {1: {'item_name': 'lettuce', 'price': 2.65, 'units': 'no'}, 2: {'item_name': 'ketchup', 'price': 1.51, 'units': 'litres'}}}
要得到所需的格式化输出,可以使用一个保存当前键的栈来遍历此结构。生成器很适合动态地生成每一行查询:
def format_query(tree, stack=('content_json',)) -> str:
    """Yield one select-list line per leaf of a nested dict.

    ``stack`` is the tuple of keys walked so far. For each leaf, all keys
    except the last are joined with ``->`` (ints bare, others single-quoted);
    the last key follows ``->>`` and doubles as the column alias. Leaf values
    are discarded.
    """
    if isinstance(tree, dict):
        # Inner node: push each key onto the stack and recurse.
        for name in tree:
            yield from format_query(tree[name], stack + (name,))
        return

    # Leaf: the stack is complete, so turn it into a single line.
    *ancestors, leaf_key = stack
    rendered = [
        str(step) if isinstance(step, int) else "'%s'" % step
        for step in ancestors
    ]
    yield "%s ->> '%s' as %s" % (' -> '.join(rendered), leaf_key, leaf_key)
给出第二个示例,这使您可以获得查询行列表:
>>> list(format_query(treeify({ "name": "John", "items": [ { "item_name": "lettuce", "price": 2.65, "units": "no" }, { "item_name": "ketchup", "price": 1.51, "units": "litres" } ] })))
["'content_json' ->> 'name' as name",
"'content_json' -> 'items' -> 1 ->> 'item_name' as item_name",
"'content_json' -> 'items' -> 1 ->> 'price' as price",
"'content_json' -> 'items' -> 1 ->> 'units' as units",
"'content_json' -> 'items' -> 2 ->> 'item_name' as item_name",
"'content_json' -> 'items' -> 2 ->> 'price' as price",
"'content_json' -> 'items' -> 2 ->> 'units' as units"]