I am building a calculation pipeline on Dataflow that writes metrics to BigQuery. At one point in the pipeline, I have 50+ million key-value pairs that need to be grouped by key, with a calculation performed on each group of values. I am using CombinePerKey with a custom CombineFn accumulator. Once the CombinePerKey completes, the accumulated results are written to BigQuery.
When the pipeline runs on Dataflow, the work is spread across 50+ workers. This results in 40+ metrics with the same key but different values being written to BigQuery. When the input is small enough to run on a single worker, the pipeline produces the output I expect: exactly one output per unique key, with correctly aggregated values. How do I design this so that the values are aggregated correctly per key even when the work is spread across multiple workers?
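For reference, this is the behavior I expect from CombinePerKey: exactly one output element per unique key. A toy example (made-up keys and values, run locally with the DirectRunner):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('a', 1), ('a', 2), ('b', 3)])
         | beam.CombinePerKey(sum)
         | beam.Map(print))
    # Prints exactly one element per key: ('a', 3) and ('b', 4)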
Here is the full pipeline:
def run(argv=None):
"""Runs the recidivism calculation pipeline."""
# Workaround to load SQLAlchemy objects at start of pipeline. This is
# necessary because the BuildRootEntity function tries to access attributes
# of relationship properties on the SQLAlchemy root_schema_class before they
# have been loaded. However, if *any* SQLAlchemy objects have been
# instantiated, then the relationship properties are loaded and their
# attributes can be successfully accessed.
_ = schema.StatePerson()
# Parse command-line arguments
known_args, pipeline_args = parse_arguments(argv)
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = True
# Get pipeline job details
all_pipeline_options = pipeline_options.get_all_options()
query_dataset = all_pipeline_options['project'] + '.' + known_args.input
with beam.Pipeline(argv=pipeline_args) as p:
# Get StatePersons
persons = (p
| 'Load Persons' >>
BuildRootEntity(dataset=query_dataset,
data_dict=None,
root_schema_class=schema.StatePerson,
root_entity_class=entities.StatePerson,
unifying_id_field='person_id',
build_related_entities=True))
# Get StateIncarcerationPeriods
incarceration_periods = (p
| 'Load IncarcerationPeriods' >>
BuildRootEntity(
dataset=query_dataset,
data_dict=None,
root_schema_class=
schema.StateIncarcerationPeriod,
root_entity_class=
entities.StateIncarcerationPeriod,
unifying_id_field='person_id',
build_related_entities=True))
# Get StateSupervisionViolationResponses
supervision_violation_responses = \
(p
| 'Load SupervisionViolationResponses' >>
BuildRootEntity(
dataset=query_dataset,
data_dict=None,
root_schema_class=schema.StateSupervisionViolationResponse,
root_entity_class=entities.StateSupervisionViolationResponse,
unifying_id_field='person_id',
build_related_entities=True
))
# Group StateIncarcerationPeriods and StateSupervisionViolationResponses
# by person_id
incarceration_periods_and_violation_responses = (
{'incarceration_periods': incarceration_periods,
'violation_responses': supervision_violation_responses}
| 'Group StateIncarcerationPeriods to '
'StateSupervisionViolationResponses' >>
beam.CoGroupByKey()
)
# Set the fully hydrated StateSupervisionViolationResponse entities on
# the corresponding StateIncarcerationPeriods
incarceration_periods_with_source_violations = (
incarceration_periods_and_violation_responses
| 'Set hydrated StateSupervisionViolationResponses on '
'the StateIncarcerationPeriods' >>
beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))
# Group each StatePerson with their StateIncarcerationPeriods
person_and_incarceration_periods = (
{'person': persons,
'incarceration_periods':
incarceration_periods_with_source_violations}
| 'Group StatePerson to StateIncarcerationPeriods' >>
beam.CoGroupByKey()
)
# Identify ReleaseEvents from the StatePerson's
# StateIncarcerationPeriods
person_events = (
person_and_incarceration_periods |
'Get Release Events' >>
GetReleaseEvents())
# Get dimensions to include and methodologies to use
inclusions, methodologies = dimensions_and_methodologies(known_args)
# Get pipeline job details for accessing job_id
all_pipeline_options = pipeline_options.get_all_options()
# Add timestamp for local jobs
job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
all_pipeline_options['job_timestamp'] = job_timestamp
# Get recidivism metrics
recidivism_metrics = (person_events
| 'Get Recidivism Metrics' >>
GetRecidivismMetrics(
pipeline_options=all_pipeline_options,
inclusions=inclusions))
filter_metrics_kwargs = {'methodologies': methodologies}
# Filter out unneeded metrics
final_recidivism_metrics = (
recidivism_metrics
| 'Filter out unwanted metrics' >>
beam.ParDo(FilterMetrics(), **filter_metrics_kwargs))
# Convert the metrics into a format that's writable to BQ
writable_metrics = (final_recidivism_metrics
| 'Convert to dict to be written to BQ' >>
beam.ParDo(
RecidivismMetricWritableDict()).with_outputs(
'rates', 'counts', 'liberties'))
# Write the recidivism metrics to the output tables in BigQuery
rates_table = known_args.output + '.recidivism_rate_metrics'
counts_table = known_args.output + '.recidivism_count_metrics'
liberty_table = known_args.output + '.recidivism_liberty_metrics'
_ = (writable_metrics.rates
| f"Write rate metrics to BQ table: {rates_table}" >>
beam.io.WriteToBigQuery(
table=rates_table,
create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
))
_ = (writable_metrics.counts
| f"Write count metrics to BQ table: {counts_table}" >>
beam.io.WriteToBigQuery(
table=counts_table,
create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
))
_ = (writable_metrics.liberties
| f"Write liberty metrics to BQ table: {liberty_table}" >>
beam.io.WriteToBigQuery(
table=liberty_table,
create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
))
The transform that produces the huge number of key-value pairs and then runs the CombinePerKey steps is GetRecidivismMetrics:
class GetRecidivismMetrics(beam.PTransform):
"""Transforms a StatePerson and ReleaseEvents into RecidivismMetrics."""
def __init__(self, pipeline_options: Dict[str, str],
inclusions: Dict[str, bool]):
super(GetRecidivismMetrics, self).__init__()
self._pipeline_options = pipeline_options
self.inclusions = inclusions
def expand(self, input_or_inputs):
# Calculate recidivism metric combinations from a StatePerson and their
# ReleaseEvents
recidivism_metric_combinations = (
input_or_inputs
| 'Map to metric combinations' >>
beam.ParDo(CalculateRecidivismMetricCombinations(),
**self.inclusions).with_outputs('counts', 'rates',
'liberties'))
# Calculate the recidivism count values for the metrics combined by key
counts_with_sums = (recidivism_metric_combinations.counts
| 'Calculate recidivism counts values' >>
beam.CombinePerKey(SumFn()))
# Calculate the recidivism rate values for the metrics combined by key
rates_with_values = (recidivism_metric_combinations.rates
| 'Calculate recidivism rate values' >>
beam.CombinePerKey(RecidivismRateFn()))
# Calculate the recidivism liberty values for metrics combined by key
liberties_with_values = (recidivism_metric_combinations.liberties
| 'Calculate time at liberty values' >>
beam.CombinePerKey(RecidivismLibertyFn()))
# Produce the ReincarcerationRecidivismCountMetrics
counts_metrics = (counts_with_sums
| 'Produce recidivism count metrics' >>
beam.ParDo(
ProduceReincarcerationRecidivismCountMetric(),
**self._pipeline_options))
# Produce the ReincarcerationRecidivismRateMetrics
rates_metrics = (rates_with_values
| 'Produce recidivism rate metrics' >>
beam.ParDo(
ProduceReincarcerationRecidivismMetric(),
**self._pipeline_options))
# Produce the ReincarcerationRecidivismLibertyMetrics
liberties_metrics = (liberties_with_values
| 'Produce recidivism liberty metrics' >>
beam.ParDo(
ProduceReincarcerationRecidivismMetric(),
**self._pipeline_options))
# Merge the metric groups
merged_metrics = ((counts_metrics, rates_metrics, liberties_metrics)
| 'Merge counts, rates, and liberties metrics' >>
beam.Flatten())
# Return ReincarcerationRecidivismMetric objects
return merged_metrics
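The CombineFns themselves (SumFn, RecidivismRateFn, RecidivismLibertyFn) are not shown here, but they follow the standard Beam CombineFn shape. A simplified sketch of SumFn (an illustration, not the exact implementation):

class SumFn(beam.CombineFn):
    """Sums the values grouped under each key."""
    def create_accumulator(self):
        # Fresh accumulator for each key (per bundle, per worker)
        return 0

    def add_input(self, accumulator, input):
        return accumulator + input

    def merge_accumulators(self, accumulators):
        # Beam merges the per-worker accumulators for a key here
        return sum(accumulators)

    def extract_output(self, accumulator):
        return accumulator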
Here is the CalculateRecidivismMetricCombinations DoFn:
class CalculateRecidivismMetricCombinations(beam.DoFn):
"""Calculates recidivism metric combinations."""
def process(self, element, *args, **kwargs):
"""Produces various recidivism metric combinations.
Sends the calculator the StatePerson entity and their corresponding
ReleaseEvents for mapping all recidivism combinations.
Args:
element: Tuple containing a StatePerson and their ReleaseEvents
**kwargs: This should be a dictionary with values for the
following keys:
- age_bucket
- gender
- stay_length_bucket
- release_facility
- race
- ethnicity
Yields:
Each recidivism metric combination, tagged by metric type.
"""
person, release_events = element
# Calculate recidivism metric combinations for this person and events
metric_combinations = \
calculator.map_recidivism_combinations(person,
release_events, kwargs)
# Return each of the recidivism metric combinations
for metric_combination in metric_combinations:
metric_key, value = metric_combination
# Freeze the metric key as a frozenset so it is hashable
frozen_metric_key = frozenset(metric_key.items())
if metric_key.get('metric_type') == 'rate':
yield beam.pvalue.TaggedOutput('rates',
(frozen_metric_key, value))
elif metric_key.get('metric_type') == 'count':
yield beam.pvalue.TaggedOutput('counts',
(frozen_metric_key, value))
elif metric_key.get('metric_type') == 'liberty':
yield beam.pvalue.TaggedOutput('liberties',
(frozen_metric_key, value))
The map_recidivism_combinations function produces thousands of dictionaries per person. These dictionaries become the keys that are then grouped across all people in the CombinePerKey calls.
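To make the key structure concrete, here is an illustrative pair (hypothetical values) as emitted by the DoFn above:

# A hypothetical (metric_key, value) combination:
metric_key = {'metric_type': 'rate',
              'gender': 'male',
              'age_bucket': '25-29'}
value = 1
# What is actually yielded to the downstream CombinePerKey:
frozen_metric_key = frozenset(metric_key.items())
# -> TaggedOutput('rates', (frozen_metric_key, 1))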
Solution
It turns out that frozenset does not serialize deterministically across machines: iteration order depends on the interpreter's hash randomization, so pickling the same logical frozenset on two workers can produce different byte strings. Since Beam groups elements by the encoded form of the key rather than by Python equality, identical keys were treated as distinct keys when the Dataflow job ran on multiple workers. Converting the dictionary keys to JSON strings with sorted keys solved the problem.
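A minimal sketch of the fix (assuming every value in metric_key is JSON-serializable; anything else would need converting first):

import json

def to_stable_key(metric_key):
    """Serialize a metric_key dict to a deterministic string.

    json.dumps with sort_keys=True always emits keys in the same
    order, so equal dictionaries produce byte-for-byte identical
    strings on every worker, regardless of hash randomization.
    """
    return json.dumps(metric_key, sort_keys=True)

# In CalculateRecidivismMetricCombinations.process, replace
#     frozen_metric_key = frozenset(metric_key.items())
# with
#     stable_key = to_stable_key(metric_key)
#     yield beam.pvalue.TaggedOutput('rates', (stable_key, value))
# and have the downstream ParDos recover the dictionary with
#     metric_key = json.loads(stable_key)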