Dataflow: overriding the coder used by the Shuffler

Date: 2018-01-29 20:56:44

Tags: google-cloud-dataflow apache-beam

We are running a set of operations on Python objects with Dataflow. Because our objects are not picklable, we use a DillCoder, and that generally works well. We can override the coder registry's _fallback_coder, or supply the coder directly to a Reader or Writer - that works too.
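
For context, this is roughly the registry override we use - a minimal sketch only; the full example further down does the same thing via an ObjectCoder subclass:

import apache_beam as beam
import apache_beam.coders.coders as coders
from apache_beam.coders.coders import DillCoder
from apache_beam.coders.typecoders import FirstOf

# Keep FastPrimitivesCoder for common types, but fall back to dill rather
# than pickle for anything it doesn't recognise.
primitives_coder = coders.FastPrimitivesCoder(fallback_coder=DillCoder())
beam.coders.typecoders.registry._fallback_coder = FirstOf(
    [coders.ProtoCoder, primitives_coder])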

However, as soon as our objects need to be shuffled, we get an error that our objects can't be pickled by the PickleCoder.

I think the Dataflow worker's shuffler doesn't consult the coder registry and always uses the PickleCoder instead. Is that right? Is there a way to override it?
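
The only workaround we can think of so far (a sketch only, not verified on the service) is to dill-serialize the values ourselves before the GroupByKey, so the shuffle only ever handles bytes, and to deserialize afterwards. Here `pairs` stands for a hypothetical PCollection of (key, unpicklable_value) tuples:

import dill
import apache_beam as beam

# pairs: hypothetical PCollection of (key, unpicklable_value) tuples.
# Serialize the values to bytes before the shuffle and deserialize after,
# so the worker's coder only ever sees byte strings.
encoded = pairs | 'encode values' >> beam.Map(
    lambda kv: (kv[0], dill.dumps(kv[1])))
grouped = encoded | 'group' >> beam.GroupByKey()
decoded = grouped | 'decode values' >> beam.Map(
    lambda kv: (kv[0], [dill.loads(v) for v in kv[1]]))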

The full stack trace is below. The job ID is 2018-01-25_15_21_50-3249437741466877997.

Thanks

Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 582, in do_work
    work_executor.execute()
  File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 167, in execute
    op.start()
  File "dataflow_worker/shuffle_operations.py", line 49, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
    def start(self):
  File "dataflow_worker/shuffle_operations.py", line 50, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
    with self.scoped_start_state:
  File "dataflow_worker/shuffle_operations.py", line 65, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
    with self.shuffle_source.reader() as reader:
  File "dataflow_worker/shuffle_operations.py", line 69, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
    self.output(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output
    cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
    cython.cast(Operation, consumer).process(windowed_value)
  File "dataflow_worker/shuffle_operations.py", line 233, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
    self.output(wvalue.with_value((k, wvalue.value)))
  File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output
    cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
    cython.cast(Operation, consumer).process(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process
    with self.scoped_process_state:
  File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process
    self.dofn_receiver.receive(o)
  File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive
    self.process(windowed_value)
  File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process
    self._reraise_augmented(exn)
  File "apache_beam/runners/common.py", line 415, in apache_beam.runners.common.DoFnRunner._reraise_augmented
    raise
  File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process
    self.do_fn_invoker.invoke_process(windowed_value)
  File "apache_beam/runners/common.py", line 281, in apache_beam.runners.common.PerWindowInvoker.invoke_process
    self._invoke_per_window(windowed_value)
  File "apache_beam/runners/common.py", line 306, in apache_beam.runners.common.PerWindowInvoker._invoke_per_window
    self.output_processor.process_outputs(
  File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs
    self.main_receivers.receive(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
    cython.cast(Operation, consumer).process(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process
    with self.scoped_process_state:
  File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process
    self.dofn_receiver.receive(o)
  File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive
    self.process(windowed_value)
  File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process
    self._reraise_augmented(exn)
  File "apache_beam/runners/common.py", line 431, in apache_beam.runners.common.DoFnRunner._reraise_augmented
    raise new_exn, None, original_traceback
  File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process
    self.do_fn_invoker.invoke_process(windowed_value)
  File "apache_beam/runners/common.py", line 189, in apache_beam.runners.common.SimpleInvoker.invoke_process
    self.output_processor.process_outputs(
  File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs
    self.main_receivers.receive(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 84, in apache_beam.runners.worker.operations.ConsumerSet.receive
    self.update_counters_start(windowed_value)
  File "apache_beam/runners/worker/operations.py", line 90, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
    self.opcounter.update_from(windowed_value)
  File "apache_beam/runners/worker/opcounters.py", line 63, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
    self.do_sample(windowed_value)
  File "apache_beam/runners/worker/opcounters.py", line 81, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
    self.coder_impl.get_estimated_size_and_observables(windowed_value))
  File "apache_beam/coders/coder_impl.py", line 730, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
    def get_estimated_size_and_observables(self, value, nested=False):
  File "apache_beam/coders/coder_impl.py", line 739, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
    self._value_coder.get_estimated_size_and_observables(
  File "apache_beam/coders/coder_impl.py", line 260, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.get_estimated_size_and_observables
    self.encode_to_stream(value, out, nested)
  File "apache_beam/coders/coder_impl.py", line 298, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.encode_to_stream
    self.fallback_coder_impl.encode_to_stream(value, stream, nested)
  File "apache_beam/coders/coder_impl.py", line 154, in apache_beam.coders.coder_impl.CallbackCoderImpl.encode_to_stream
    return stream.write(self._encoder(value), nested)
  File "/usr/local/lib/python2.7/dist-packages/apache_beam/coders/coders.py", line 497, in <lambda>
    lambda x: dumps(x, HIGHEST_PROTOCOL), pickle.loads)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed [while running 'run reversion/FlatMap(run_reversion)']

Edit:

Here is a reproducible example. The job ID is 2018-01-30_11_53_22-14709945294406059840.

import zlib
import argparse
from itertools import chain

import apache_beam as beam
import apache_beam.coders.coders as coders
from apache_beam.coders.coders import (DillCoder, coder_impl, maybe_dill_dumps,
                                       maybe_dill_loads)
from apache_beam.coders.typecoders import CoderRegistry, FirstOf
from apache_beam.options.pipeline_options import PipelineOptions


class ObjectCoder(DillCoder):
    """
    Coder that allows multi-line pickles to be read, and compresses the output
    After an object is pickled, the bytes are encoded as `unicode_escape`,
    meaning newline characters (`\n`) aren't in the string.

    # but now we're doing compression, do we need to do the newlines?

    Previously, the presence of newline characters these confues the Dataflow
    reader, as it can't discriminate between a new object and a new line
    within a pickle string
    """

    def _create_impl(self):
        return coder_impl.CallbackCoderImpl(
            dill_compress_dumps, dill_compress_loads)


def dill_compress_dumps(stream):
    # in Py3 this needs to be `unicode_escape`
    return zlib.compress(maybe_dill_dumps(stream)).encode('string_escape')


def dill_compress_loads(stream):
    # in Py3 this needs to be `unicode_escape`
    decoded = stream.decode('string_escape')
    try:
        decompressed = zlib.decompress(decoded)
    except zlib.error:
        decompressed = stream
    return maybe_dill_loads(decompressed)


def add_obj_to_coder_registry(coder_registry):
    """
    Add Dill to the middle of the coder registry - after the deterministic
    coders but before the Pickle coder
    Use like: `add_obj_to_coder_registry(beam.coders.registry)`
    """
    assert isinstance(coder_registry, CoderRegistry)

    primitives_coder = coders.FastPrimitivesCoder(fallback_coder=ObjectCoder())

    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/coders/typecoders.py#L93  # noqa
    fallback_coders = [coders.ProtoCoder, primitives_coder]

    coder_registry._fallback_coder = FirstOf(fallback_coders)


known_args, unknown_args = argparse.ArgumentParser().parse_known_args()

default_options = dict(
    runner='DataflowRunner',
    project='PROJECT_ID',
    temp_location='gs://BUCKET_NAME/dataflow/temp/',
    staging_location='gs://BUCKET_NAME/dataflow/staging/',
    max_num_workers='100')

# from https://github.com/apache/incubator-airflow/blob/master/airflow/contrib/hooks/gcp_dataflow_hook.py  # noqa
default_options_args = ['--{}={}'.format(attr, value)
                        for attr, value in default_options.items()]

default_args = chain(
    default_options_args,
    ['--save_main_session'])

args = chain(unknown_args, default_args)

add_obj_to_coder_registry(beam.coders.typecoders.registry)


def produce_unpicklable():
    def double(x):
        return x * 2
    return double


p = beam.Pipeline(options=PipelineOptions(list(args)))

items = p | beam.Create([x for x in range(10)])

unpickleable = items | beam.Map(lambda x: (x, produce_unpicklable()))

shuffle = unpickleable | 'use shuffler' >> beam.GroupByKey()

r = p.run()
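
For reference, the pickle-vs-dill difference for the value produced above can be checked locally, independent of Beam (a quick sanity check only):

import pickle
import dill


def produce_unpicklable():
    def double(x):
        return x * 2
    return double


obj = produce_unpicklable()

# dill can round-trip the nested function...
assert dill.loads(dill.dumps(obj))(3) == 6

# ...but the standard pickle module cannot, which matches the error above
# (PicklingError on Python 2, AttributeError on Python 3).
try:
    pickle.dumps(obj)
except (pickle.PicklingError, AttributeError) as exc:
    print('pickle failed: %s' % exc)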

0 Answers