在不转换为集合的情况下比较嵌套字典(dict)的差异

时间:2015-06-06 19:57:03

标签: python json recursion diff

接下来的几个月里,这将是我的工作重点。这是一个非常有趣的挑战,其解决方案可以用于许多需要传递大量 JSON 数据的系统。

我已经按照每个方法的预期输出编写了单元测试。如果有人能就解决这个问题的好算法或技术提出建议,仅此一点就会有很大帮助。

或者,如果你雄心勃勃,也可以直接替我解决 :-)

class Diff():
    """
    Report how a dict 'new' differs from a dict 'old'.

    Meant for an auditing system: an existing record from a nosql
    database ('old') is compared with an incoming request to update
    that record ('new'), and the difference is stored in a separate
    database.

    The expected output of every method is pinned in test_diff.py.

    Requirements:
      * nested dicts have to be handled, so the methods will need to
        recurse;
      * the path to a differing key is written in dot notation,
        e.g. {"a.b.c": "modified value"};
      * dicts must not be converted to sets: with billions of records
        to compare, converting to sets and back to dicts would
        drastically slow the process down.
    """

    def added(self, old, new):
        """Placeholder, not implemented yet: ignores its arguments and returns None."""
        return None

    def modified(self, old, new):
        """Placeholder, not implemented yet: ignores its arguments and returns None."""
        return None

    def deleted(self, old, new):
        """Placeholder, not implemented yet: ignores its arguments and returns None."""
        return None

    def difference(self, old, new):
        """Placeholder, not implemented yet: ignores its arguments and returns None."""
        return None

    def has_key(self, key):
        """
        Explicitly defined for Python 2/3 compatibility.

        Subscripts this object with `key`; returns True when the lookup
        succeeds and False when it raises KeyError.

        NOTE(review): Diff as written defines no __getitem__, so the
        lookup only works when a subclass supplies one -- confirm the
        intended use.
        """
        try:
            self[key]
        except KeyError:
            return False
        else:
            return True

这是单元测试

import unittest
from diff import Diff

class TestDiff(unittest.TestCase):
    """Pin the expected output of every Diff method on one nested example."""

    def setUp(self):
        # In the innermost dict "x" changes value, "z" disappears and "y" is new.
        inner_old = {"x": "A", "z": "d"}
        inner_new = {"x": "C", "y": "B"}
        self.old_dict = {"dict1": {"dict2": inner_old}}
        self.new_dict = {"dict1": {"dict2": inner_new}}
        self.d = Diff()

    def test_added_logic(self):
        """The key found only in the new dict is reported as ADDED."""
        expected = dict(operation="ADDED", field="dict1.dict2.y", new="B")
        result = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_modified_logic(self):
        """The key whose value changed is reported as MODIFIED with both values."""
        expected = dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")
        result = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_deleted_logic(self):
        """The key found only in the old dict is reported as DELETED."""
        expected = dict(operation="DELETED", field="dict1.dict2.z", old="d")
        result = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_difference_logic(self):
        """difference() wraps the three reports, in ADDED/MODIFIED/DELETED order."""
        added = dict(operation="ADDED", field="dict1.dict2.y", new="B")
        modified = dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")
        deleted = dict(operation="DELETED", field="dict1.dict2.z", old="d")
        expected = {"results": [added, modified, deleted]}
        result = self.d.difference(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)


# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

3 个答案:

答案 0 :(得分:0)

我尝试实现了一下,并且通过了单元测试(在 Python 2.7 中)。不过我把单元测试的期望值改得更统一了:现在每个方法始终返回一个由 dict 对象组成的列表。否则,前三个测试在存在多处差异时就没有意义,而且最后一个测试与它们不一致。

欢迎提出改进意见......

class Diff():
    """
    Show the difference between two given dicts 'old' and 'new'.

    Intended to be used in an auditing system that checks existing
    records in a nosql database ('old') and compares them with a
    request to update the record ('new') and store the difference
    in a separate database.

    See test_diff.py for expected output for each method.

    Nested dicts are walked recursively. Every reported difference is a
    dict whose 'field' entry is the dot-separated path to the key,
    e.g. "a.b.c" (a top-level key is reported as just "a").

    Dicts are never converted to sets. Because of the billions of records
    that will be compared, conversion to sets, then back to dicts would
    drastically slow down the process.

    Every public method returns a freshly built list, so one instance can
    be reused for any number of comparisons.
    """

    def __init__(self):
        # Both attributes are kept only for code that already reads them:
        # `results` mirrors the list returned by the most recent call and
        # `depth` is no longer used by the traversal, so it stays empty.
        self.depth = []
        self.results = []

    def _collect(self, operation, old, new, prefix, found):
        """
        Append every `operation` difference between `old` and `new` to `found`.

        :param operation: 'ADDED', 'MODIFIED' or 'DELETED'
        :param old: dict holding the previous state
        :param new: dict holding the requested state
        :param prefix: dotted path of the enclosing dicts, either '' or
            ending with '.'
        :param found: list that receives the report dicts
        :return: `found`, for convenience
        """
        # DELETED looks for keys of `old` missing from `new`; the other two
        # operations iterate `new`, which preserves the original ordering.
        if operation == 'DELETED':
            walked, other = old, new
        else:
            walked, other = new, old

        for key in walked:
            if key not in other:
                if operation == 'ADDED':
                    found.append({
                        'operation': 'ADDED',
                        'field': prefix + str(key),
                        'new': new[key]
                    })
                elif operation == 'DELETED':
                    found.append({
                        'operation': 'DELETED',
                        'field': prefix + str(key),
                        'old': old[key]
                    })
                continue

            old_value = old[key]
            new_value = new[key]
            if isinstance(old_value, dict) and isinstance(new_value, dict):
                # The path travels as an argument instead of instance state,
                # so an exception cannot leave a half-updated path behind.
                self._collect(operation, old_value, new_value,
                              prefix + str(key) + '.', found)
            elif operation == 'MODIFIED' and old_value != new_value:
                found.append({
                    'operation': 'MODIFIED',
                    'field': prefix + str(key),
                    'old': old_value,
                    'new': new_value
                })
        return found

    def added(self, old, new):
        """Return a list of ADDED reports: keys present in `new` but not in `old`."""
        self.results = self._collect('ADDED', old, new, '', [])
        return self.results

    def modified(self, old, new):
        """Return a list of MODIFIED reports: shared keys whose values differ."""
        self.results = self._collect('MODIFIED', old, new, '', [])
        return self.results

    def deleted(self, old, new):
        """Return a list of DELETED reports: keys present in `old` but not in `new`."""
        self.results = self._collect('DELETED', old, new, '', [])
        return self.results

    def difference(self, old, new):
        """Return all ADDED, then all MODIFIED, then all DELETED reports as one list."""
        found = []
        for operation in ('ADDED', 'MODIFIED', 'DELETED'):
            self._collect(operation, old, new, '', found)
        self.results = found
        return found

import unittest
from diff import Diff

class TestDiff(unittest.TestCase):
    """Check each Diff method on one nested example; every method returns a list."""

    def setUp(self):
        # In the innermost dict "x" changes value, "z" disappears and "y" is new.
        inner_old = {"x": "A", "z": "d"}
        inner_new = {"x": "C", "y": "B"}
        self.old_dict = {"dict1": {"dict2": inner_old}}
        self.new_dict = {"dict1": {"dict2": inner_new}}
        self.d = Diff()

    def test_added_logic(self):
        """The key found only in the new dict is the single ADDED entry."""
        expected = [dict(operation="ADDED", field="dict1.dict2.y", new="B")]
        result = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_modified_logic(self):
        """The key whose value changed is the single MODIFIED entry."""
        expected = [dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")]
        result = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_deleted_logic(self):
        """The key found only in the old dict is the single DELETED entry."""
        expected = [dict(operation="DELETED", field="dict1.dict2.z", old="d")]
        result = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_difference_logic(self):
        """difference() lists the ADDED, MODIFIED and DELETED entries in that order."""
        added = dict(operation="ADDED", field="dict1.dict2.y", new="B")
        modified = dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")
        deleted = dict(operation="DELETED", field="dict1.dict2.z", old="d")
        expected = [added, modified, deleted]
        result = self.d.difference(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)


# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

答案 1 :(得分:0)

感谢 mike.k 解决了这个问题。我对代码做了一些修改:添加了针对扁平(只有一层深度)字典的测试;修复了扁平字典的字段名前面多出一个点的问题;最后,由于输出要转换为 JSON 存入数据库,我又添加了一个方法,把最终的差异结果包装成一个字典。

class Diff():
    """
    Show the difference between two given dicts 'old' and 'new'.

    Intended to be used in an auditing system that checks existing
    records in a nosql database ('old') and compares them with a
    request to update the record ('new') and store the difference
    in a separate database.

    See test_diff.py for expected output for each method.

    Nested dicts are walked recursively. Every reported difference is a
    dict whose 'field' entry is the dot-separated path to the key,
    e.g. "a.b.c" (a top-level key is reported as just "a").

    Dicts are never converted to sets. Because of the billions of records
    that will be compared, conversion to sets, then back to dicts would
    drastically slow down the process.

    Every public method builds a fresh result, so one instance can be
    reused for any number of comparisons.
    """

    def __init__(self):
        # Both attributes are kept only for code that already reads them:
        # `results` mirrors the list produced by the most recent call and
        # `depth` is no longer used by the traversal, so it stays empty.
        self.depth = []
        self.results = []

    def _collect(self, operation, old, new, prefix, found):
        """
        Append every `operation` difference between `old` and `new` to `found`.

        :param operation: 'ADDED', 'MODIFIED' or 'DELETED'
        :param old: dict holding the previous state
        :param new: dict holding the requested state
        :param prefix: dotted path of the enclosing dicts, either '' or
            ending with '.'
        :param found: list that receives the report dicts
        :return: `found`, for convenience
        """
        # DELETED looks for keys of `old` missing from `new`; the other two
        # operations iterate `new`, which preserves the original ordering.
        if operation == 'DELETED':
            walked, other = old, new
        else:
            walked, other = new, old

        for key in walked:
            if key not in other:
                if operation == 'ADDED':
                    found.append({
                        'operation': 'ADDED',
                        'field': prefix + str(key),
                        'new': new[key]
                    })
                elif operation == 'DELETED':
                    found.append({
                        'operation': 'DELETED',
                        'field': prefix + str(key),
                        'old': old[key]
                    })
                continue

            old_value = old[key]
            new_value = new[key]
            if isinstance(old_value, dict) and isinstance(new_value, dict):
                # The path travels as an argument instead of instance state,
                # so an exception cannot leave a half-updated path behind.
                self._collect(operation, old_value, new_value,
                              prefix + str(key) + '.', found)
            elif operation == 'MODIFIED' and old_value != new_value:
                found.append({
                    'operation': 'MODIFIED',
                    'field': prefix + str(key),
                    'old': old_value,
                    'new': new_value
                })
        return found

    def added(self, old, new):
        """Return a list of ADDED reports: keys present in `new` but not in `old`."""
        self.results = self._collect('ADDED', old, new, '', [])
        return self.results

    def modified(self, old, new):
        """Return a list of MODIFIED reports: shared keys whose values differ."""
        self.results = self._collect('MODIFIED', old, new, '', [])
        return self.results

    def deleted(self, old, new):
        """Return a list of DELETED reports: keys present in `old` but not in `new`."""
        self.results = self._collect('DELETED', old, new, '', [])
        return self.results

    def combine_results(self, old, new):
        """Return all ADDED, then all MODIFIED, then all DELETED reports as one list."""
        found = []
        for operation in ('ADDED', 'MODIFIED', 'DELETED'):
            self._collect(operation, old, new, '', found)
        self.results = found
        return found

    def difference(self, old, new):
        """Return {'difference': <combined report list>}, a dict ready for JSON storage."""
        return {'difference': self.combine_results(old, new)}

这是unittest

import unittest
from diff import Diff

class TestDiff(unittest.TestCase):
    """Check each Diff method on a nested example and on a flat, one-level example."""

    def setUp(self):
        # Nested pair: in the innermost dict "x" changes, "z" disappears, "y" is new.
        inner_old = {"x": "A", "z": "d"}
        inner_new = {"x": "C", "y": "B"}
        self.old_dict = {"dict1": {"dict2": inner_old}}
        self.new_dict = {"dict1": {"dict2": inner_new}}
        # Flat pair: "a" changes, "z" disappears, "b" is new.
        self.old_flat = dict(a="a", z="z")
        self.new_flat = dict(a="f", b="b")

        self.d = Diff()

    def test_added_logic(self):
        """Nested: the key found only in the new dict is the single ADDED entry."""
        expected = [dict(operation="ADDED", field="dict1.dict2.y", new="B")]
        result = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_added_flat_dict(self):
        """Flat: the field name of an ADDED key is the bare key."""
        expected = [dict(field="b", operation="ADDED", new="b")]
        result = self.d.added(self.old_flat, self.new_flat)
        self.assertEqual(result, expected)

    def test_modified_logic(self):
        """Nested: the key whose value changed is the single MODIFIED entry."""
        expected = [dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")]
        result = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_modified_flat_dict(self):
        """Flat: the field name of a MODIFIED key is the bare key."""
        expected = [dict(field="a", operation="MODIFIED", new="f", old="a")]
        result = self.d.modified(self.old_flat, self.new_flat)
        self.assertEqual(result, expected)

    def test_deleted_logic(self):
        """Nested: the key found only in the old dict is the single DELETED entry."""
        expected = [dict(operation="DELETED", field="dict1.dict2.z", old="d")]
        result = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)

    def test_deleted_flat_dict(self):
        """Flat: the field name of a DELETED key is the bare key."""
        expected = [dict(field="z", operation="DELETED", old="z")]
        result = self.d.deleted(self.old_flat, self.new_flat)
        self.assertEqual(result, expected)

    def test_difference_logic(self):
        """combine_results() lists the ADDED, MODIFIED and DELETED entries in that order."""
        added = dict(operation="ADDED", field="dict1.dict2.y", new="B")
        modified = dict(operation="MODIFIED", field="dict1.dict2.x", old="A", new="C")
        deleted = dict(operation="DELETED", field="dict1.dict2.z", old="d")
        expected = [added, modified, deleted]
        result = self.d.combine_results(self.old_dict, self.new_dict)
        self.assertEqual(result, expected)


# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

答案 2 :(得分:0)

基于 @mike-k 的回答,我写了一个不需要新建类的函数。

实现

from collections import namedtuple

# One shared record type for every result. It is built once at import time
# because creating a namedtuple class is far too costly to repeat on each
# call and each recursion level.
DictDiffResult = namedtuple('DictDiffResult', ('key', 'dict1_value', 'dict2_value', 'operation'))


def dict_diff(dict1: dict, dict2: dict, operation='modified', keys=None):
    """
    Find the differences between two (possibly nested) dicts.

    :param dict1: first dict to compare
    :param dict2: second dict to compare
    :param operation: one of ('added', 'deleted', 'modified')
        'added' checks for keys in `dict2` which do not occur in `dict1`
        'deleted' checks for keys in `dict1` which do not occur in `dict2`
        'modified' checks for all changes
    :param keys: list of the keys leading to `dict1`/`dict2` inside an
        enclosing structure; it is prepended to every reported key path and
        does not normally need to be specified by the caller
    :return: a list of `DictDiffResult` namedtuples
        ('key', 'dict1_value', 'dict2_value', 'operation'), where
            `key` is the list of keys leading to the changed element
            `operation` is in ('added', 'deleted', 'modified')
        At each nesting level the shared keys are visited in sorted order,
        followed by the sorted deleted keys and then the sorted added keys.
    :raises ValueError: if `operation` is not one of the accepted values
    """
    acceptable_methods = ('added', 'deleted', 'modified')
    if operation not in acceptable_methods:
        raise ValueError('parameter `operation` should be on of %s' % str(acceptable_methods))
    diffs = []
    # Validation happens once here; the recursion lives in the helper.
    _dict_diff_sorted(dict1, dict2, operation, [] if keys is None else list(keys), diffs)
    return diffs


def _dict_diff_sorted(dict1, dict2, operation, keys, diffs):
    """Append the differences between `dict1` and `dict2` to `diffs`, in sorted key order."""
    shared = dict1.keys() & dict2.keys()

    for key in sorted(shared):
        value1 = dict1[key]
        value2 = dict2[key]
        if isinstance(value1, dict) and isinstance(value2, dict):
            _dict_diff_sorted(value1, value2, operation, keys + [key], diffs)
        elif operation == 'modified' and value1 != value2:
            diffs.append(DictDiffResult(keys + [key], value1, value2, 'modified'))

    # 'modified' reports every kind of change, so it includes both loops below.
    if operation != 'added':
        for key in sorted(dict1.keys() - shared):
            diffs.append(DictDiffResult(keys + [key], dict1[key], None, 'deleted'))
    if operation != 'deleted':
        for key in sorted(dict2.keys() - shared):
            diffs.append(DictDiffResult(keys + [key], None, dict2[key], 'added'))

如果您希望获得更好的性能,并且不需要排序,可以去掉 sorted();也可以用 yield 代替 return 来实现此函数,从而得到 generator 而不是 list。

文档测试

"""
    >>> dict1 = {"key1":{"key2": {'a': 1, "x":"A","z":"d"}}, 'key3': 1, }
    >>> dict2 = {"key1":{"key2": {'a': 1, "x":"C","y":"B"}}, 'key4': 1, }

    >>> dict_diff(dict1, dict2) # doctest: +NORMALIZE_WHITESPACE
    [DictDiffResult(key=['key1', 'key2', 'x'], dict1_value='A', dict2_value='C', operation='modified'),
    DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
    DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
    DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted'),
    DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]

    >>> dict_diff(dict1, dict1) # doctest: +NORMALIZE_WHITESPACE
    []

    >>> dict_diff(dict1, dict2, operation='modified') # doctest: +NORMALIZE_WHITESPACE
    [DictDiffResult(key=['key1', 'key2', 'x'], dict1_value='A', dict2_value='C', operation='modified'),
    DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
    DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
    DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted'),
    DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]

    >>> dict_diff(dict1, dict2, operation='deleted') # doctest: +NORMALIZE_WHITESPACE
    [DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
    DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted')]

    >>> dict_diff(dict1, dict2, operation='added') # doctest: +NORMALIZE_WHITESPACE
    [DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
    DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]

    >>> dict_diff(dict1, dict2, operation='wrong') # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: parameter `operation` should be on of ('added', 'deleted', 'modified')
"""

不使用 set 的实现

如果您确实需要不使用 set 的实现,可以这样做:

from collections import namedtuple

# One shared record type for every result. It is built once at import time
# because creating a namedtuple class is far too costly to repeat on each
# call and each recursion level.
DictDiffResult = namedtuple('DictDiffResult', ('key', 'dict1_value', 'dict2_value', 'operation'))


def dict_diff(dict1: dict, dict2: dict, operation='modified', keys=None):
    """
    Find the differences between two (possibly nested) dicts without building sets.

    :param dict1: first dict to compare
    :param dict2: second dict to compare
    :param operation: one of ('added', 'deleted', 'modified')
        'added' checks for keys in `dict2` which do not occur in `dict1`
        'deleted' checks for keys in `dict1` which do not occur in `dict2`
        'modified' checks for all changes
    :param keys: list of the keys leading to `dict1`/`dict2` inside an
        enclosing structure; it is prepended to every reported key path and
        does not normally need to be specified by the caller
    :return: a list of `DictDiffResult` namedtuples
        ('key', 'dict1_value', 'dict2_value', 'operation'), where
            `key` is the list of keys leading to the changed element
            `operation` is in ('added', 'deleted', 'modified')
        Results follow the iteration order of `dict1` (deleted, nested and
        modified entries) and then of `dict2` (added entries); nothing is
        sorted.
    :raises ValueError: if `operation` is not one of the accepted values
    """
    acceptable_methods = ('added', 'deleted', 'modified')
    if operation not in acceptable_methods:
        raise ValueError('parameter `operation` should be on of %s' % str(acceptable_methods))
    diffs = []
    # Validation happens once here; the recursion lives in the helper.
    _dict_diff_unsorted(dict1, dict2, operation, [] if keys is None else list(keys), diffs)
    return diffs


def _dict_diff_unsorted(dict1, dict2, operation, keys, diffs):
    """Append the differences between `dict1` and `dict2` to `diffs`, in dict iteration order."""
    # 'modified' reports every kind of change, so it enables all three flags.
    report_deleted = operation != 'added'
    report_added = operation != 'deleted'
    report_modified = operation == 'modified'

    for key, value1 in dict1.items():
        if key not in dict2:
            if report_deleted:
                diffs.append(DictDiffResult(keys + [key], value1, None, 'deleted'))
            continue
        value2 = dict2[key]
        if isinstance(value1, dict) and isinstance(value2, dict):
            _dict_diff_unsorted(value1, value2, operation, keys + [key], diffs)
        elif report_modified and value1 != value2:
            # The key path is only built for entries that are actually reported.
            diffs.append(DictDiffResult(keys + [key], value1, value2, 'modified'))

    # The second pass over dict2 is skipped when added keys are not wanted.
    if report_added:
        for key, value2 in dict2.items():
            if key not in dict1:
                diffs.append(DictDiffResult(keys + [key], None, value2, 'added'))

由于这种实现会遍历整个 dict2,对于包含大量相同键值的大字典来说,它可能会更慢。

第二种方法还有一个小细节:结果不再排序,不过这一点很容易改回来。