我正在尝试了解Apache Beam Python SDK内部并且正在阅读一个类型检查部分。我写了一个非常简单的管道如下:
class AddZeroFn(beam.DoFn):
    """A DoFn that appends the character '0' to every incoming string element."""

    def process(self, element):
        # e.g. '1' -> '10'; returned as a one-element list of outputs.
        suffixed = element + '0'
        return [suffixed]
def run(argv=None):
    """Build and run the toy pipeline: create three strings, append '0', write out.

    Args:
        argv: Optional command-line arguments. Anything not consumed by the
            local parser is forwarded to PipelineOptions.
    """
    arg_parser = argparse.ArgumentParser()
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args, pipeline_type_check=True)
    # save_main_session makes module-level definitions (e.g. AddZeroFn)
    # picklable and available on remote workers.
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        numbers = p | beam.Create(['1', '2', '3'])
        numbers = numbers | beam.ParDo(AddZeroFn())
        numbers | 'Write' >> WriteToText('result.txt')
if __name__ == '__main__':
    # Surface pipeline progress messages when run as a script.
    logging.getLogger().setLevel(logging.INFO)
    run()
然后结果是
10
20
30
好的,然后我添加了输入的类型检查,比如
numbers = numbers | beam.ParDo(AddZeroFn().with_input_types(str))
这时运行没有问题;而如果我把 str 改成 int,则会如预期那样报错。
apache_beam.typehints.decorators.TypeCheckError:
Type hint violation for 'ParDo(AddZeroFn)':
requires <type 'int'> but got <type 'str'> for element
然而,当我添加输出类型检查时
numbers = numbers | beam.ParDo(AddZeroFn().with_output_types(float))
它没有报任何错误。我本以为会看到与输入 typehint 类似的错误,但没有引发任何错误。是我误解了输出 typehint 的用法吗?如果是这样,能否请教 with_output_types
预期会如何表现?
同样ptransform.type_check_inputs_or_outputs
的行如下
if pvalue_.element_type is None:
# TODO(robertwb): It's a bug that we ever get here. (typecheck)
continue
if hint and not typehints.is_consistent_with(pvalue_.element_type, hint):
at_context = ' %s %s' % (input_or_output, context) if context else ''
raise TypeCheckError(
'%s type hint violation at %s%s: expected %s, got %s' % (
input_or_output.title(), self.label, at_context, hint,
pvalue_.element_type))
但是如果我在第一个if块中设置了一些print语句,我发现在很多情况下程序进入该块,这意味着跳过了类型检查。 如果有人能帮助我理解关于typehint的当前正确行为,我将不胜感激。
Apache Beam的版本是2.2.0。 (我也测试过2.3.0dev0)
已添加(2017-12-27):
我一直在使用DirectRunner进行测试,但更改为DataflowRunner,现在看到以下错误。这是我们在设置with_output_types
时期望看到的内容吗?当我设置with_input_types(int)
时,它在将作业发送到Dataflow之前失败了,所以我认为在输出类型上也会发生同样的事情。
(7b12756b863da949): Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 582, in do_work
work_executor.execute()
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 167, in execute
op.start()
File "dataflow_worker/native_operations.py", line 38, in dataflow_worker.native_operations.NativeReadOperation.start
def start(self):
File "dataflow_worker/native_operations.py", line 39, in dataflow_worker.native_operations.NativeReadOperation.start
with self.scoped_start_state:
File "dataflow_worker/native_operations.py", line 44, in dataflow_worker.native_operations.NativeReadOperation.start
with self.spec.source.reader() as reader:
File "dataflow_worker/native_operations.py", line 54, in dataflow_worker.native_operations.NativeReadOperation.start
self.output(windowed_value)
File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 431, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise new_exn, None, original_traceback
File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 189, in apache_beam.runners.common.SimpleInvoker.invoke_process
self.output_processor.process_outputs(
File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 84, in apache_beam.runners.worker.operations.ConsumerSet.receive
self.update_counters_start(windowed_value)
File "apache_beam/runners/worker/operations.py", line 90, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
self.opcounter.update_from(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 63, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
self.do_sample(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 81, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "apache_beam/coders/coder_impl.py", line 730, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
def get_estimated_size_and_observables(self, value, nested=False):
File "apache_beam/coders/coder_impl.py", line 739, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
self._value_coder.get_estimated_size_and_observables(
File "apache_beam/coders/coder_impl.py", line 99, in apache_beam.coders.coder_impl.CoderImpl.get_estimated_size_and_observables
return self.estimate_size(value, nested), []
File "apache_beam/coders/coder_impl.py", line 442, in apache_beam.coders.coder_impl.VarIntCoderImpl.estimate_size
return get_varint_size(value)
File "apache_beam/coders/stream.pyx", line 222, in apache_beam.coders.stream.get_varint_size
cpdef libc.stdint.int64_t get_varint_size(libc.stdint.int64_t value):
TypeError: an integer is required [while running 'ParDo(AddZeroFn)']
答案 0(得分:1):
指定的输出类型仅用于确保与后续转换的一致。例如,如果你写了
numbers2 = numbers | beam.ParDo(AddZeroFn().with_output_types(float))
numbers2 | beam.ParDo(...).with_input_types(str)
你会收到错误。