Question

我有一个JSON对象，其中可能包含项目（item）和位置（location）都相同的重复条目，我希望对每组重复条目只保留风险等级最高的那一条（并且只保留一条）。

[{
'item': 'itemone',
'location': 'locationone',
'risk_level': 'Low'
#Other values are omitted
},
{
'item': 'itemone',
'location': 'locationone',
'risk_level': 'High'
},
{
'item': 'itemone',
'location': 'locationone',
'risk_level': 'Moderate'
},
{
'item': 'itemone',
'location': 'locationone',
'risk_level': 'High'
},
{
'item': 'itemtwo',
'location': 'locationtwo',
'risk_level': 'Low'
}]

我尝试将其转换为 pandas DataFrame，按 risk_level 排序后使用 drop_duplicates，但这会导致JSON中的其他值出现问题（例如，将None转换为NaN，将int转换为float等），所以我认为这种做法不可行。

    # Load the records into a DataFrame, ordering the columns like the keys
    # of the first record.
    first_record_keys = list(response['data'][0].keys())
    dfInsights = pd.DataFrame(response['data']).reindex(columns=first_record_keys)
    # Order the rows by risk_level, then keep only the first row seen for
    # each (item, location) pair.
    dfInsights = dfInsights.sort_values(['risk_level'])
    dfInsights = dfInsights.drop_duplicates(['item', 'location'], keep='first')
    # Convert back to a list of per-row dicts.
    dfToJSON = dfInsights.to_dict(orient='records')

我希望结果是：

[{
'item': 'itemone',
'location': 'locationone',
'risk_level': 'High'
},
{
'item': 'itemtwo',
'location': 'locationtwo',
'risk_level': 'Low'
}]

Answer 1

您可以使用 `itertools.groupby` 和基于权重的自定义键函数：

以下代码会打印去重后的结果：

from itertools import groupby
from operator import itemgetter
from pprint import pprint

# Sample records; several of them share the same (item, location) pair.
d = [
    {
        'item': 'itemone',
        'location': 'locationone',
        'risk_level': 'Low',
        # Other values are omitted
    },
    {
        'item': 'itemone',
        'location': 'locationone',
        'risk_level': 'High',
    },
    {
        'item': 'itemone',
        'location': 'locationone',
        'risk_level': 'Moderate',
    },
    {
        'item': 'itemone',
        'location': 'locationone',
        'risk_level': 'High',
    },
    {
        'item': 'itemtwo',
        'location': 'locationtwo',
        'risk_level': 'Low',
    },
]

# Two records are duplicates when this (item, location) tuple is equal.
f = itemgetter('item', 'location')
# A smaller weight sorts earlier, so 'High' comes first inside each group.
weights = {'Low': 2, 'Moderate': 1, 'High': 0}


def _sort_key(record):
    """Order records by (item, location) first, then by risk weight."""
    return f(record), weights[record['risk_level']]


# groupby only merges adjacent records, hence the sort on the same key first.
ordered = sorted(d, key=_sort_key)
# The first record of every group is the one with the lowest weight.
out = [next(members) for _, members in groupby(ordered, key=f)]

pprint(out, width=30)

Answer 2

在蒂莫西的帮助下，以下是解决方案：

import unittest

class TestRemoveDuplicates(unittest.TestCase):
    """Check that records sharing (item, location) collapse to the highest risk."""

    def setUp(self):
        """No fixtures are needed for these tests."""
        pass

    def filter_dups(self, curr_doc, filtered_docs):
        """Tell whether ``curr_doc`` should be kept.

        Args:
            curr_doc: Record with 'item', 'location' and a comparable
                'risk_level' (``test_json`` passes the integer rank).
            filtered_docs: Records that have already been kept.

        Returns:
            False if some kept record has the same 'item' and 'location' and
            a 'risk_level' greater than or equal to that of ``curr_doc``;
            True otherwise.
        """
        return not any(
            curr_doc['item'] == kept['item']
            and curr_doc['location'] == kept['location']
            and curr_doc['risk_level'] <= kept['risk_level']
            for kept in filtered_docs
        )

    def test_json(self):
        """De-duplicate the sample records and verify the surviving entries."""
        # Position in this list is the rank of a level: higher index = riskier.
        levels = [None, 'Low', 'Moderate', 'High', 'Critical']

        test_json = [
            {
                'item': 'itemone',
                'location': 'locationone',
                'risk_level': 'Low',
                # Other values are omitted
            },
            {
                'item': 'itemone',
                'location': 'locationone',
                'risk_level': 'High',
            },
            {
                'item': 'itemone',
                'location': 'locationone',
                'risk_level': 'Moderate',
            },
            {
                'item': 'itemone',
                'location': 'locationone',
                'risk_level': 'High',
            },
            {
                'item': 'itemtwo',
                'location': 'locationtwo',
                'risk_level': 'Low',
            },
        ]

        expected_json = [
            {
                'item': 'itemone',
                'location': 'locationone',
                'risk_level': 'High',
            },
            {
                'item': 'itemtwo',
                'location': 'locationtwo',
                'risk_level': 'Low',
            },
        ]

        # Replace each level name by its rank on a copy, so the input records
        # are left untouched and levels compare numerically.
        risk_conv_json = [
            {**docs, 'risk_level': levels.index(docs['risk_level'])}
            for docs in test_json
        ]

        # Highest rank first: filter_dups relies on seeing the riskiest
        # record of every (item, location) pair before its duplicates.
        sorted_json = sorted(
            risk_conv_json, key=lambda x: x['risk_level'], reverse=True
        )

        filtered_json = []
        for curr_sorted_doc in sorted_json:
            if self.filter_dups(curr_sorted_doc, filtered_json):
                filtered_json.append(curr_sorted_doc)

        # Map the ranks back to their level names.
        output_json = [
            {**docs, 'risk_level': levels[docs['risk_level']]}
            for docs in filtered_json
        ]

        print(output_json)
        # Without an assertion this test could never fail.
        self.assertEqual(output_json, expected_json)

    def tearDown(self):
        """Nothing to clean up."""
        pass

Python-根据JSON中的值从JSON中删除重复的元素

2 个答案: