Question

我正在尝试用 Python 编写代码来解决这个问题。我有以下对象：仅当其他所有键值都相等时（即站点的 name 和 id，以及 metadata 中的 id、title、url、desc 都相等），才合并 links 键的内容。请注意，结构始终是一致的。

{
    "websites": [
        {
            "output": {
                "site": {
                    "id": 1,
                    "name": "microsoft"
                },
                "metadata": [
                    {
                        "id": 1,
                        "title": "microsoft Demo site",
                        "links": "[{'links': [{'resource': ['google', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'}]}]",
                        "url": "https://example.com",
                        "desc": "description goes here"
                    },
                    {
                        "id": 1,
                        "title": "microsoft Demo site",
                          "links": "[{'links': [{'resource': ['google', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}, {'resource': ['youtube', 'wikipedia', 'yahoo'], 'link_status': '0', 'updated': '0'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'}]}]",
                        "url": "https://example.com",
                        "desc": "description goes here"
                    }
                ]
            }
        },
        {
            "output": {
                "site": {
                    "id": 1,
                    "name": "Google"
                },
                "metadata": [
                    {
                        "id": 1,
                        "title": "google Demo site",
                        "links": "[{'links': [{'resource': ['amazon', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}]}]",
                        "url": "https://example.com",
                        "desc": "description goes here"
                    }
                ]
            }
        }
    ]
}

在上面的示例中，我期望以下输出：

{
    "websites": [
        {
            "output": {
                "site": {
                    "id": 1,
                    "name": "microsoft"
                },
                "metadata": [
                    {
                        "id": 1,
                        "title": "microsoft Demo site",
                        "links": "[{'links': [{'resource': ['google', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'},{'resource': ['youtube', 'wikipedia', 'yahoo'], 'link_status': '0', 'updated': '0'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'}]}]",
                        "url": "https://example.com",
                        "desc": "description goes here"
                    }
                ]
            }
        },
        {
            "output": {
                "site": {
                    "id": 1,
                    "name": "Google"
                },
                "metadata": [
                    {
                        "id": 1,
                        "title": "google Demo site",
                        "links": "[{'links': [{'resource': ['amazon', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}]}]",
                        "url": "https://example.com",
                        "desc": "description goes here"
                    }
                ]
            }
        }
    ]
}

此代码根据网站名称合并对象，但不执行上述操作。

# NOTE(review): the bare `return` suggests this is the body of a function whose
# header is not shown; `website_list` and `itertools` must already be in scope.
# Sort the entries by 'name' (groupby only groups adjacent items), then pair
# each name with the list of entries that carry it.
x_list = [[parent, list(child)] for parent, child in itertools.groupby(sorted(website_list, key=lambda x: x['name']),  key=lambda x: x['name'])]
# Build one dict per name whose 'metadata' is the concatenation of the
# 'metadata' lists of every entry in that group.
results = [{'name': parent, 'metadata': [c for j in child for c in j['metadata']]} for parent, child in x_list]
# Drop every metadata item that equals an earlier item in the same list, so
# only the first occurrence is kept. Inside this comprehension the name
# `website_list` is reused as the enumerate index, not the input list.
final_result = [{**i, 'metadata': [c for website_list, c in enumerate(i['metadata']) if all(parent != c for parent in i['metadata'][:website_list])]} for i in results]
return final_result

Answer 1

您可以使用itertools.groupby：

from itertools import groupby
import ast, json
headers = ['id', 'title', 'url', 'desc']
def _key(d):
  return [d.get(i) for i in headers]

def get_links(b):
    """Concatenate the link lists of several entries into one JSON string.

    Each item of ``b`` is indexed with ``'links'``; that value is parsed with
    ``ast.literal_eval`` and iterated, and the ``'links'`` value of every
    parsed element is collected. The collected sequences are then flattened,
    in order and without de-duplication, into a single list.

    Returns the JSON text of ``[{"links": <flattened list>}]``.
    """
    # First pass: parse every entry and gather the inner 'links' sequences.
    link_groups = []
    for entry in b:
        for wrapper in ast.literal_eval(entry['links']):
            link_groups.append(wrapper['links'])

    # Second pass: flatten the gathered sequences, preserving their order.
    flattened = []
    for group in link_groups:
        flattened.extend(group)

    return json.dumps([{'links': flattened}])

def merge(d):
    """Collapse entries of ``d`` that share the same ``_key`` into one dict.

    ``d`` is sorted by ``_key`` so that equal keys become adjacent, then
    grouped with ``groupby``. For every group the result holds one dict that
    maps each name in ``headers`` to the matching key value and adds a
    ``'links'`` item computed by ``get_links`` over the group's entries.

    Returns the list of merged dicts in sorted-key order.
    """
    ordered = sorted(d, key=_key)

    # Materialise each group right away: a groupby sub-iterator is only
    # valid until the outer iterator advances.
    grouped = []
    for key_values, members in groupby(ordered, key=_key):
        grouped.append((key_values, list(members)))

    merged = []
    for key_values, members in grouped:
        entry = dict(zip(headers, key_values))
        entry['links'] = get_links(members)
        merged.append(entry)
    return merged

# Rebuild the document: every website keeps its 'output' fields, with the
# 'metadata' list replaced by its merged form, then pretty-print it as JSON.
result = {
    'websites': [
        {
            'output': {
                **site['output'],
                'metadata': merge(site['output']['metadata']),
            }
        }
        for site in data['websites']
    ]
}
print(json.dumps(result, indent=4))

输出：

{
  "websites": [
    {
        "output": {
            "site": {
                "id": 1,
                "name": "microsoft"
            },
            "metadata": [
                {
                    "id": 1,
                    "title": "microsoft Demo site",
                    "url": "https://example.com",
                    "desc": "description goes here",
                    "links": "[{\"links\": [{\"resource\": [\"google\", \"twitter\", \"facebook\"], \"link_status\": \"1\", \"updated\": \"1\"}, {\"resource\": [\"linkedin\", \"box\", \"microsoft\"], \"link_status\": \"0\", \"updated\": \"1\"}, {\"resource\": [\"google\", \"twitter\", \"facebook\"], \"link_status\": \"1\", \"updated\": \"1\"}, {\"resource\": [\"youtube\", \"wikipedia\", \"yahoo\"], \"link_status\": \"0\", \"updated\": \"0\"}, {\"resource\": [\"linkedin\", \"box\", \"microsoft\"], \"link_status\": \"0\", \"updated\": \"1\"}]}]"
                }
            ]
        }
    },
    {
        "output": {
            "site": {
                "id": 1,
                "name": "Google"
            },
            "metadata": [
                {
                    "id": 1,
                    "title": "google Demo site",
                    "url": "https://example.com",
                    "desc": "description goes here",
                    "links": "[{\"links\": [{\"resource\": [\"amazon\", \"twitter\", \"facebook\"], \"link_status\": \"1\", \"updated\": \"1\"}]}]"
                }
            ]
        }
      }
  ]
}

没有json.dumps的输出：

{'websites': [{'output': {'site': {'id': 1, 'name': 'microsoft'}, 'metadata': [{'id': 1, 'title': 'microsoft Demo site', 'url': 'https://example.com', 'desc': 'description goes here', 'links': '[{"links": [{"resource": ["google", "twitter", "facebook"], "link_status": "1", "updated": "1"}, {"resource": ["linkedin", "box", "microsoft"], "link_status": "0", "updated": "1"}, {"resource": ["google", "twitter", "facebook"], "link_status": "1", "updated": "1"}, {"resource": ["youtube", "wikipedia", "yahoo"], "link_status": "0", "updated": "0"}, {"resource": ["linkedin", "box", "microsoft"], "link_status": "0", "updated": "1"}]}]'}]}}, {'output': {'site': {'id': 1, 'name': 'Google'}, 'metadata': [{'id': 1, 'title': 'google Demo site', 'url': 'https://example.com', 'desc': 'description goes here', 'links': '[{"links": [{"resource": ["amazon", "twitter", "facebook"], "link_status": "1", "updated": "1"}]}]'}]}}]}

Answer 2

我试着写了一下。如果有错，请指正 :)

import json
data =""" {
        "websites": [
            {
                "output": {
                    "site": {
                        "id": 1,
                        "name": "microsoft"
                    },
                    "metadata": [
                        {
                            "id": 1,
                            "title": "microsoft Demo site",
                            "links": "[{'links': [{'resource': ['google', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'}]}]",
                            "url": "https://example.com",
                            "desc": "description goes here"
                        },
                        {
                            "id": 1,
                            "title": "microsoft Demo site",
                              "links": "[{'links': [{'resource': ['google', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}, {'resource': ['youtube', 'wikipedia', 'yahoo'], 'link_status': '0', 'updated': '0'}, {'resource': ['linkedin', 'box', 'microsoft'], 'link_status': '0', 'updated': '1'}]}]",
                            "url": "https://example.com",
                            "desc": "description goes here"
                        }
                    ]
                }
            },
            {
                "output": {
                    "site": {
                        "id": 1,
                        "name": "Google"
                    },
                    "metadata": [
                        {
                            "id": 1,
                            "title": "google Demo site",
                            "links": "[{'links': [{'resource': ['amazon', 'twitter', 'facebook'], 'link_status': '1', 'updated': '1'}]}]",
                            "url": "https://example.com",
                            "desc": "description goes here"
                        }
                    ]
                }
            }
        ]
    }"""

加载数据后，执行此操作的主要代码如下：

# Parse the JSON text, then de-duplicate each website's metadata list by
# 'title' and print the resulting document.
json_data = json.loads(data)
metadata_att = json_data["websites"]
for website in metadata_att:
    # Use a dedicated loop name so the JSON source string in `data` is not
    # overwritten by the iteration.
    metadata = website['output']['metadata']
    # Keep an entry only when no later entry has the same title, so the last
    # occurrence of every title survives. The filtered list is built first
    # and then slice-assigned: the list is never mutated while it is being
    # iterated, and json_data sees the change in place.
    # NOTE: dropped entries are discarded as-is; their 'links' strings are
    # not merged into the surviving entry.
    metadata[:] = [
        entry
        for position, entry in enumerate(metadata)
        if all(entry['title'] != later['title'] for later in metadata[position + 1:])
    ]
    print("---------------")
print(json_data)

希望您找到了想要的东西...

根据特定键合并字典列表

2 个答案: