我想比较同一条管道两次运行的结果:两次运行输出的 JSON 架构相同、数据不同,我需要找出它们之间的差异。
Run1 JSON
{"doc_id": 1, "entity": "Anthony", "start": 0, "end": 7}
{"doc_id": 1, "entity": "New York", "start": 30, "end": 38} # Missing from Run2
{"doc_id": 2, "entity": "Istanbul", "start": 0, "end": 8}
Run2 JSON
{"doc_id": 1, "entity": "Anthony", "start": 0, "end": 7} # same as in Run1
{"doc_id": 2, "entity": "Istanbul", "start": 0, "end": 10} # different end span
{"doc_id": 2, "entity": "Karim", "start": 10, "end": 15} # added in Run2, not in Run1
参考这个回答(How do I perform a "diff" on two Sources given a key using Apache Beam Python SDK?),我的做法是:取若干 JSON 字段的值组成一个元组作为复合键,然后用 CoGroupByKey 按这个复合键把两次运行的记录联合分组。
有没有更好的方法用 Beam 对 JSON 做差异比较(diff)?
基于链接答案的代码:
try:
    # Python 2: `basestring` is the common base of `str` and `unicode`.
    _STRING_TYPES = (basestring,)  # noqa: F821
except NameError:
    # Python 3: `basestring` is gone; json.loads accepts both str and bytes.
    _STRING_TYPES = (str, bytes)


def make_kv_pair(x):
    """Key a JSON record by its ``(doc_id, entity)`` pair.

    Args:
        x: Either an already-parsed record (a mapping with at least the
            ``doc_id`` and ``entity`` keys) or a non-empty JSON string
            encoding such a record.

    Returns:
        A ``(key, record)`` tuple where ``key`` is ``(doc_id, entity)`` and
        ``record`` is the parsed record.

    Raises:
        KeyError: If the record has no ``doc_id`` or ``entity`` field.
        ValueError: If ``x`` is a string that is not valid JSON.
    """
    # Only non-empty strings are parsed; anything else is used as given.
    if x and isinstance(x, _STRING_TYPES):
        x = json.loads(x)
    key = tuple(x[dict_key] for dict_key in ("doc_id", "entity"))
    return (key, x)
class FilterDoFn(beam.DoFn):
    """Tag each co-grouped key as unchanged, added, removed or changed.

    Each input element is a ``(key, values)`` pair in which ``values`` maps
    ``'table_a'`` and ``'table_b'`` to the records found under that key in
    the respective input. Exactly one tagged output carrying the key is
    emitted per element:

    * ``unchanged`` -- both sides hold the same records;
    * ``added``     -- ``table_b`` holds more records than ``table_a``;
    * ``removed``   -- ``table_b`` holds fewer records than ``table_a``;
    * ``changed``   -- same number of records, but they differ.
    """

    def process(self, element):
        """Yield a single ``TaggedOutput`` classifying ``element``'s key.

        Args:
            element: A ``(key, values)`` pair; ``values['table_a']`` and
                ``values['table_b']`` must be iterables of records.
        """
        # Unpack in the body: tuple parameters in the signature were
        # removed in Python 3 (PEP 3113).
        key, values = element
        table_a_value = list(values['table_a'])
        table_b_value = list(values['table_b'])

        if self._same_records(table_a_value, table_b_value):
            tag = 'unchanged'
        elif len(table_a_value) < len(table_b_value):
            tag = 'added'
        elif len(table_a_value) > len(table_b_value):
            tag = 'removed'
        else:
            # Same count but different content.
            tag = 'changed'
        yield pvalue.TaggedOutput(tag, key)

    @staticmethod
    def _same_records(left, right):
        """Return True if both lists hold equal records, ignoring order.

        Grouped values arrive in no guaranteed order, so a positional
        ``left == right`` could report a spurious difference. Records may
        be unhashable dicts, hence the remove-based multiset comparison;
        it is quadratic, which is fine for the handful of records that
        share one key.
        """
        if len(left) != len(right):
            return False
        unmatched = list(right)
        for record in left:
            try:
                unmatched.remove(record)
            except ValueError:
                return False
        return not unmatched
管道代码:
# Read both runs and key every record so the two runs can be joined.
run1_lines = p | 'ReadJSONRun1' >> ReadFromText("run1.json")
table_a = run1_lines | 'SetKeysRun1' >> beam.Map(make_kv_pair)

run2_lines = p | 'ReadJSONRun2' >> ReadFromText("run2.json")
table_b = run2_lines | 'SetKeysRun2' >> beam.Map(make_kv_pair)

# Group the records of both runs under their shared key.
joined_tables = {'table_a': table_a, 'table_b': table_b} | beam.CoGroupByKey()

# Split the joined keys into one tagged output per kind of difference.
output_types = ['changed', 'added', 'removed', 'unchanged']
classify_keys = beam.ParDo(FilterDoFn()).with_outputs(*output_types)
key_collections = joined_tables | classify_keys

# Now you can handle each output
# Each tag gets its own directory, step label and file suffix, all derived
# from the tag name (e.g. 'added' -> "WriteAdded", "added/",
# "_added.json.gz").
for tag in ('unchanged', 'changed', 'added', 'removed'):
    (getattr(key_collections, tag)
     | "Write" + tag.capitalize()
     >> WriteToText(tag + "/", file_name_suffix="_" + tag + ".json.gz"))