这将是我未来几个月的工作重点。这是一个非常有趣的挑战,其成果可以在许多场景下用于需要传递大量 JSON 数据的系统。
我已经针对每个方法的预期输出编写了单元测试。如果有人能就解决这个问题的好算法或技术提出建议,仅此一点就会有很大帮助。
或者,如果你雄心勃勃,也可以直接替我解决 :-)
class Diff():
    """
    Show the difference between two given dicts 'old' and 'new'.

    Intended to be used in an auditing system that checks existing
    records in a nosql database ('old') and compares them with a
    request to update the record ('new') and store the difference
    in a separate database.

    See test_diff.py for expected output for each method.

    Will need recursive methods for nested dicts.
    Use dot notation for the path to the key that is different,
    e.g. {"a.b.c": "modified value"}.

    Must not convert dicts to sets. Because of the billions of records
    that will be compared, conversion to sets, then back to dicts would
    drastically slow down the process.

    This is a skeleton: the four comparison methods are not implemented
    yet and currently return None.
    """

    def added(self, old, new):
        """Placeholder: meant to report keys present in `new` but not `old`.

        The accompanying test expects a record shaped like
        {"operation": "ADDED", "field": <dotted path>, "new": <value>}.
        """
        pass

    def modified(self, old, new):
        """Placeholder: meant to report keys whose value differs.

        The accompanying test expects a record shaped like
        {"operation": "MODIFIED", "field": <dotted path>,
         "old": <value>, "new": <value>}.
        """
        pass

    def deleted(self, old, new):
        """Placeholder: meant to report keys present in `old` but not `new`.

        The accompanying test expects a record shaped like
        {"operation": "DELETED", "field": <dotted path>, "old": <value>}.
        """
        pass

    def difference(self, old, new):
        """Placeholder: meant to combine the three reports above.

        The accompanying test expects {"results": [...]} holding the ADDED,
        MODIFIED and DELETED records, in that order.
        """
        pass

    def has_key(self, key):
        """Explicitly defined for Python 2/3 compatibility.

        Returns True when ``self[key]`` succeeds and False when it raises
        KeyError.
        """
        # NOTE(review): this subscripts the Diff instance itself, but no
        # __getitem__ is defined in the visible class -- confirm whether the
        # lookup was meant to target one of the compared dicts instead.
        try:
            self[key]
        except KeyError:
            return False
        return True
这是单元测试
import unittest
from diff import Diff
class TestDiff(unittest.TestCase):
    """Pin the expected output of every ``Diff`` method on a nested fixture."""

    def setUp(self):
        # Both fixtures share the path dict1.dict2; below it "x" changes
        # value, "z" disappears and "y" is introduced.
        self.old_dict = {"dict1": {"dict2": {"x": "A", "z": "d"}}}
        self.new_dict = {"dict1": {"dict2": {"x": "C", "y": "B"}}}
        self.d = Diff()

    def test_added_logic(self):
        """``added`` reports the key that exists only in the new dict."""
        actual = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            {"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"},
        )

    def test_modified_logic(self):
        """``modified`` reports the key whose value changed."""
        actual = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            {
                "operation": "MODIFIED",
                "field": "dict1.dict2.x",
                "old": "A",
                "new": "C",
            },
        )

    def test_deleted_logic(self):
        """``deleted`` reports the key that exists only in the old dict."""
        actual = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            {"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"},
        )

    def test_difference_logic(self):
        """``difference`` wraps all three records under a "results" key."""
        actual = self.d.difference(self.old_dict, self.new_dict)
        records = [
            {"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"},
            {
                "operation": "MODIFIED",
                "field": "dict1.dict2.x",
                "old": "A",
                "new": "C",
            },
            {"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"},
        ]
        self.assertEqual(actual, {"results": records})


if __name__ == "__main__":
    unittest.main()
答案 0 :(得分:0)
我尝试实现了一下,并且在 Python 2.7 中通过了单元测试。不过我把单元测试的期望值改得更加统一:现在每个方法总是返回一个由 dict 对象组成的列表。否则,前三个测试在存在多处差异时就没有意义,而且最后一个测试的格式也与它们不一致。
欢迎提出改进意见......
class Diff():
    """
    Show the difference between two given dicts 'old' and 'new'.

    Intended to be used in an auditing system that checks existing
    records in a nosql database ('old') and compares them with a
    request to update the record ('new') and store the difference
    in a separate database.

    See test_diff.py for expected output for each method.

    Nested dicts are walked recursively and the location of every
    difference is reported in dot notation, e.g. "a.b.c". Top-level keys
    are reported without any leading dot.

    Dicts are never converted to sets: because of the billions of records
    that will be compared, conversion to sets, then back to dicts would
    drastically slow down the process.

    Every public method appends its records to ``self.results`` and
    returns that same list, so records accumulate for as long as the
    instance lives. Use a fresh ``Diff()`` for each comparison.
    """

    def __init__(self):
        # Stringified keys of the nested dicts currently being walked.
        self.depth = []
        # Every record produced so far, in discovery order.
        self.results = []

    def _field(self, key):
        """Return the dot-notation path of `key` below the current depth."""
        # Joining the path and the key in one go avoids the stray leading
        # '.' that plain concatenation produced for top-level keys.
        return '.'.join(self.depth + [str(key)])

    def _descend(self, method, key, old, new):
        """Apply `method` to the nested dicts stored under `key`."""
        self.depth.append(str(key))
        try:
            method(old[key], new[key])
        finally:
            # Unwind even on error so a failure cannot leave a stale path.
            self.depth.pop()

    def added(self, old, new):
        """Record keys that exist in `new` but not in `old`.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in new:
            if key not in old:
                self.results.append({
                    'operation': 'ADDED',
                    'field': self._field(key),
                    'new': new[key]
                })
            # isinstance (not type() ==) so dict subclasses are walked too.
            elif isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.added, key, old, new)
        return self.results

    def modified(self, old, new):
        """Record keys present in both dicts whose values differ.

        Pairs of nested dicts are compared key by key instead of being
        reported as a single change.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in new:
            if key not in old:
                continue
            if isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.modified, key, old, new)
            elif old[key] != new[key]:
                self.results.append({
                    'operation': 'MODIFIED',
                    'field': self._field(key),
                    'old': old[key],
                    'new': new[key]
                })
        return self.results

    def deleted(self, old, new):
        """Record keys that exist in `old` but not in `new`.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in old:
            if key not in new:
                self.results.append({
                    'operation': 'DELETED',
                    'field': self._field(key),
                    'old': old[key]
                })
            elif isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.deleted, key, old, new)
        return self.results

    def difference(self, old, new):
        """Record additions, then modifications, then deletions.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        self.added(old, new)
        self.modified(old, new)
        self.deleted(old, new)
        return self.results
import unittest
from diff import Diff
class TestDiff(unittest.TestCase):
    """Check that every ``Diff`` method returns a list of record dicts."""

    def setUp(self):
        # Below dict1.dict2: "x" changes value, "z" is removed, "y" is new.
        self.old_dict = {"dict1": {"dict2": {"x": "A", "z": "d"}}}
        self.new_dict = {"dict1": {"dict2": {"x": "C", "y": "B"}}}
        self.d = Diff()

    def test_added_logic(self):
        """Only the newly introduced key is listed."""
        actual = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [{"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"}],
        )

    def test_modified_logic(self):
        """Only the key whose value changed is listed."""
        actual = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [
                {
                    "operation": "MODIFIED",
                    "field": "dict1.dict2.x",
                    "old": "A",
                    "new": "C",
                },
            ],
        )

    def test_deleted_logic(self):
        """Only the removed key is listed."""
        actual = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [{"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"}],
        )

    def test_difference_logic(self):
        """All three records are listed: added, modified, deleted."""
        actual = self.d.difference(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [
                {"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"},
                {
                    "operation": "MODIFIED",
                    "field": "dict1.dict2.x",
                    "old": "A",
                    "new": "C",
                },
                {"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"},
            ],
        )


if __name__ == "__main__":
    unittest.main()
答案 1 :(得分:0)
感谢 mike.k 解决了这个问题。我对代码做了一些修改:添加了针对扁平(只有一层深度)字典的测试;修复了扁平字典的字段路径前面多出一个点的问题;最后,由于输出会被转换为 JSON 存入数据库,我又添加了一个方法,把最终的差异结果包装成一个字典。
class Diff():
    """
    Show the difference between two given dicts 'old' and 'new'.

    Intended to be used in an auditing system that checks existing
    records in a nosql database ('old') and compares them with a
    request to update the record ('new') and store the difference
    in a separate database.

    See test_diff.py for expected output for each method.

    Nested dicts are walked recursively and the location of every
    difference is reported in dot notation, e.g. "a.b.c"; keys of a flat
    dict are reported as-is.

    Dicts are never converted to sets: because of the billions of records
    that will be compared, conversion to sets, then back to dicts would
    drastically slow down the process.

    Every public method appends its records to ``self.results`` and
    returns that same list, so records accumulate for as long as the
    instance lives. Use a fresh ``Diff()`` for each comparison.
    """

    def __init__(self):
        # Stringified keys of the nested dicts currently being walked.
        self.depth = []
        # Every record produced so far, in discovery order.
        self.results = []

    def _field(self, key):
        """Return the dot-notation path of `key` below the current depth."""
        # One join covers both the nested and the flat case, so the record
        # literals no longer need an if/else duplicate.
        return '.'.join(self.depth + [str(key)])

    def _descend(self, method, key, old, new):
        """Apply `method` to the nested dicts stored under `key`."""
        self.depth.append(str(key))
        try:
            method(old[key], new[key])
        finally:
            # Unwind even on error so a failure cannot leave a stale path.
            self.depth.pop()

    def added(self, old, new):
        """Record keys that exist in `new` but not in `old`.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in new:
            if key not in old:
                self.results.append({
                    'operation': 'ADDED',
                    'field': self._field(key),
                    'new': new[key]
                })
            # isinstance (not type() ==) so dict subclasses are walked too.
            elif isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.added, key, old, new)
        return self.results

    def modified(self, old, new):
        """Record keys present in both dicts whose values differ.

        Pairs of nested dicts are compared key by key instead of being
        reported as a single change.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in new:
            if key not in old:
                continue
            if isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.modified, key, old, new)
            elif old[key] != new[key]:
                self.results.append({
                    'operation': 'MODIFIED',
                    'field': self._field(key),
                    'old': old[key],
                    'new': new[key]
                })
        return self.results

    def deleted(self, old, new):
        """Record keys that exist in `old` but not in `new`.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        for key in old:
            if key not in new:
                self.results.append({
                    'operation': 'DELETED',
                    'field': self._field(key),
                    'old': old[key]
                })
            elif isinstance(old[key], dict) and isinstance(new[key], dict):
                self._descend(self.deleted, key, old, new)
        return self.results

    def combine_results(self, old, new):
        """Record additions, then modifications, then deletions.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``self.results`` (list of record dicts)
        """
        self.added(old, new)
        self.modified(old, new)
        self.deleted(old, new)
        return self.results

    def difference(self, old, new):
        """Return the combined records wrapped in a dict.

        :param old: the existing dict
        :param new: the updated dict
        :return: ``{'difference': [record, ...]}``
        """
        return {'difference': self.combine_results(old, new)}
这是unittest
import unittest
from diff import Diff
class TestDiff(unittest.TestCase):
    """Exercise ``Diff`` on one nested and one flat pair of dicts."""

    def setUp(self):
        # Nested pair: below dict1.dict2 "x" changes, "z" goes, "y" arrives.
        self.old_dict = {"dict1": {"dict2": {"x": "A", "z": "d"}}}
        self.new_dict = {"dict1": {"dict2": {"x": "C", "y": "B"}}}
        # Flat pair: "a" changes, "z" goes, "b" arrives.
        self.old_flat = {"a": "a", "z": "z"}
        self.new_flat = {"a": "f", "b": "b"}
        self.d = Diff()

    def test_added_logic(self):
        """A nested addition is reported with its full dotted path."""
        actual = self.d.added(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [{"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"}],
        )

    def test_added_flat_dict(self):
        """A top-level addition is reported under the bare key."""
        actual = self.d.added(self.old_flat, self.new_flat)
        self.assertEqual(
            actual,
            [{'field': 'b', 'operation': 'ADDED', 'new': 'b'}],
        )

    def test_modified_logic(self):
        """A nested change is reported with its full dotted path."""
        actual = self.d.modified(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [
                {
                    "operation": "MODIFIED",
                    "field": "dict1.dict2.x",
                    "old": "A",
                    "new": "C",
                },
            ],
        )

    def test_modified_flat_dict(self):
        """A top-level change is reported under the bare key."""
        actual = self.d.modified(self.old_flat, self.new_flat)
        self.assertEqual(
            actual,
            [{'field': 'a', 'operation': 'MODIFIED', 'new': 'f', 'old': 'a'}],
        )

    def test_deleted_logic(self):
        """A nested removal is reported with its full dotted path."""
        actual = self.d.deleted(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [{"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"}],
        )

    def test_deleted_flat_dict(self):
        """A top-level removal is reported under the bare key."""
        actual = self.d.deleted(self.old_flat, self.new_flat)
        self.assertEqual(
            actual,
            [{'field': 'z', 'operation': 'DELETED', 'old': 'z'}],
        )

    def test_difference_logic(self):
        """``combine_results`` lists added, modified, deleted in order."""
        actual = self.d.combine_results(self.old_dict, self.new_dict)
        self.assertEqual(
            actual,
            [
                {"operation": "ADDED", "field": "dict1.dict2.y", "new": "B"},
                {
                    "operation": "MODIFIED",
                    "field": "dict1.dict2.x",
                    "old": "A",
                    "new": "C",
                },
                {"operation": "DELETED", "field": "dict1.dict2.z", "old": "d"},
            ],
        )


if __name__ == "__main__":
    unittest.main()
答案 2 :(得分:0)
基于@ mike-k回答我做了一个不需要新类的方法
from collections import namedtuple

# Built once at import time: creating a namedtuple class is costly, and a
# single shared class means every result has the same type no matter which
# recursion level produced it.
DictDiffResult = namedtuple('DictDiffResult', ('key', 'dict1_value', 'dict2_value', 'operation'))


def dict_diff(dict1: dict, dict2: dict, operation='modified', keys=None):
    """
    Find the differences between two (possibly nested) dicts.

    :param dict1: first dict to compare
    :param dict2: second dict to compare
    :param operation: one of ('added', 'deleted', 'modified')
        'added' checks for keys in `dict2` which do not occur in `dict1`
        'deleted' checks for keys in `dict1` which do not occur in `dict2`
        'modified' checks for all changes (modified, deleted and added)
    :param keys: list of the keys leading to the current nesting level;
        used by the recursion and does not need to be given by the caller
    :return: list of ``DictDiffResult(key, dict1_value, dict2_value,
        operation)`` namedtuples. `key` is the list of keys leading to the
        changed element and `operation` is in ('added', 'deleted',
        'modified'); the side on which a key is missing is reported as
        ``None``. At each nesting level the shared keys come first (in
        sorted order, nested results inline), then the deleted keys, then
        the added keys.
    :raises ValueError: if `operation` is not one of the accepted values
    """
    acceptable_methods = ('added', 'deleted', 'modified')
    if operation not in acceptable_methods:
        # Wording kept verbatim: callers and doctests match on this text.
        raise ValueError('parameter `operation` should be on of %s' % str(acceptable_methods))
    if keys is None:
        keys = []
    diffs = []

    dict1_keyset = set(dict1)
    dict2_keyset = set(dict2)
    in_dict1_and_dict2 = dict1_keyset & dict2_keyset
    in_dict1_but_not_dict2 = sorted(dict1_keyset - in_dict1_and_dict2)
    in_dict2_but_not_dict1 = sorted(dict2_keyset - in_dict1_and_dict2)

    for key in sorted(in_dict1_and_dict2):
        key_list = keys + [key]
        dict1_val = dict1[key]
        dict2_val = dict2[key]
        if isinstance(dict1_val, dict) and isinstance(dict2_val, dict):
            # Two dicts are compared key by key rather than as one value.
            diffs += dict_diff(dict1_val, dict2_val, keys=key_list, operation=operation)
        elif dict1_val != dict2_val and operation == 'modified':
            diffs.append(DictDiffResult(key_list, dict1_val, dict2_val, 'modified'))

    if operation in ('deleted', 'modified'):
        diffs += [DictDiffResult(keys + [key], dict1[key], None, 'deleted')
                  for key in in_dict1_but_not_dict2]
    if operation in ('added', 'modified'):
        diffs += [DictDiffResult(keys + [key], None, dict2[key], 'added')
                  for key in in_dict2_but_not_dict1]
    return diffs
如果您希望获得更高的性能,并且不需要结果有序,可以去掉 `sorted()`;也可以用 `yield` 代替 `return` 来实现此方法,这样得到的是生成器(generator)而不是列表(list)。
"""
>>> dict1 = {"key1":{"key2": {'a': 1, "x":"A","z":"d"}}, 'key3': 1, }
>>> dict2 = {"key1":{"key2": {'a': 1, "x":"C","y":"B"}}, 'key4': 1, }
>>> dict_diff(dict1, dict2) # doctest: +NORMALIZE_WHITESPACE
[DictDiffResult(key=['key1', 'key2', 'x'], dict1_value='A', dict2_value='C', operation='modified'),
DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted'),
DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]
>>> dict_diff(dict1, dict1) # doctest: +NORMALIZE_WHITESPACE
[]
>>> dict_diff(dict1, dict2, operation='modified') # doctest: +NORMALIZE_WHITESPACE
[DictDiffResult(key=['key1', 'key2', 'x'], dict1_value='A', dict2_value='C', operation='modified'),
DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted'),
DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]
>>> dict_diff(dict1, dict2, operation='deleted') # doctest: +NORMALIZE_WHITESPACE
[DictDiffResult(key=['key1', 'key2', 'z'], dict1_value='d', dict2_value=None, operation='deleted'),
DictDiffResult(key=['key3'], dict1_value=1, dict2_value=None, operation='deleted')]
>>> dict_diff(dict1, dict2, operation='added') # doctest: +NORMALIZE_WHITESPACE
[DictDiffResult(key=['key1', 'key2', 'y'], dict1_value=None, dict2_value='B', operation='added'),
DictDiffResult(key=['key4'], dict1_value=None, dict2_value=1, operation='added')]
>>> dict_diff(dict1, dict2, operation='wrong') # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
Traceback (most recent call last):
...
ValueError: parameter `operation` should be on of ('added', 'deleted', 'modified')
"""
不使用 set 的实现
如果您确实需要一个不使用 `set` 的实现,可以这样做:
from collections import namedtuple

# Built once at import time: creating a namedtuple class is costly, and a
# single shared class means every result has the same type no matter which
# recursion level produced it.
DictDiffResult = namedtuple('DictDiffResult', ('key', 'dict1_value', 'dict2_value', 'operation'))


def dict_diff(dict1: dict, dict2: dict, operation='modified', keys=None):
    """
    Find the differences between two (possibly nested) dicts without
    building any intermediate sets.

    :param dict1: first dict to compare
    :param dict2: second dict to compare
    :param operation: one of ('added', 'deleted', 'modified')
        'added' checks for keys in `dict2` which do not occur in `dict1`
        'deleted' checks for keys in `dict1` which do not occur in `dict2`
        'modified' checks for all changes (modified, deleted and added)
    :param keys: list of the keys leading to the current nesting level;
        used by the recursion and does not need to be given by the caller
    :return: list of ``DictDiffResult(key, dict1_value, dict2_value,
        operation)`` namedtuples; the side on which a key is missing is
        reported as ``None``. Results follow the iteration order of
        `dict1` (deletions, modifications and nested results), followed
        by the additions in the iteration order of `dict2`.
    :raises ValueError: if `operation` is not one of the accepted values
    """
    acceptable_methods = ('added', 'deleted', 'modified')
    if operation not in acceptable_methods:
        # Wording kept verbatim: callers and doctests match on this text.
        raise ValueError('parameter `operation` should be on of %s' % str(acceptable_methods))
    if keys is None:
        keys = []
    diffs = []

    # First pass over dict1: deletions, modifications and nested dicts.
    for key in dict1:
        key_list = keys + [key]
        if key not in dict2:
            if operation in ('deleted', 'modified'):
                diffs.append(DictDiffResult(key_list, dict1[key], None, 'deleted'))
            continue
        dict1_val = dict1[key]
        dict2_val = dict2[key]
        if isinstance(dict1_val, dict) and isinstance(dict2_val, dict):
            # Two dicts are compared key by key rather than as one value.
            diffs += dict_diff(dict1_val, dict2_val, keys=key_list, operation=operation)
        elif dict1_val != dict2_val and operation == 'modified':
            diffs.append(DictDiffResult(key_list, dict1_val, dict2_val, 'modified'))

    # Second pass over dict2 is only needed when additions are requested.
    if operation in ('added', 'modified'):
        for key in dict2:
            if key not in dict1:
                diffs.append(DictDiffResult(keys + [key], None, dict2[key], 'added'))
    return diffs
由于这会迭代整个dict2,对于具有大量相似值的长dicts来说,这可能会更慢
关于第二种方法的一个小细节是它不再排序,但可以很容易地改变