我正在尝试使用 Beam 从一个文本文件目录中找出出现频率最高的 10 个单词。当我返回单词及其计数时,输出的并不是出现频率最高的 10 个单词,而是所有的键值对。我使用的是 beam.combiners.Top.Of('top', 10, key=lambda (word, c): c)。
我正在使用 Apache Beam 框架在 Google Cloud 上运行代码: def run(argv=None): """主入口点;定义并运行单词计数管道。""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/', help='输入要处理的文件。') parser.add_argument('--output', dest='output', default='/ home ///// output1-', help='输出文件以将结果写入。') known_args, pipeline_args = parser.parse_known_args(argv)
# Build the pipeline from the command-line arguments argparse did not consume.
pipeline_options = PipelineOptions(pipeline_args)
# Per the Beam wordcount example, save_main_session lets DoFns that rely on
# module-level context (such as WordExtractingDoFn) work on remote workers.
pipeline_options.view_as(SetupOptions).save_main_session = True
p = beam.Pipeline(options=pipeline_options)

# Read the input text into a PCollection of lines.
lines = p | 'read' >> ReadFromText(known_args.input)


# Count the occurrences of each word.
def count_ones(word_ones):
    """Collapse a (word, iterable of ones) pair into (word, total)."""
    (word, ones) = word_ones
    return (word, sum(ones))


counts = (
    lines
    # NOTE(review): `unicode` only exists on Python 2; use `str` on Python 3.
    | 'split' >> (beam.ParDo(WordExtractingDoFn())
                  .with_output_types(unicode))
    | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
    | 'group' >> beam.GroupByKey()
    # Use the named helper instead of a tuple-parameter lambda, which is
    # Python 2-only syntax (removed by PEP 3113).
    | 'count' >> beam.Map(count_ones)
    # Top.Of takes the element count as its first argument; the step label is
    # already supplied by `'top' >>`. Passing 'top' positionally put a string
    # where the count belongs, so the limit of 10 was not applied -- the
    # likely reason every (word, count) pair was being emitted.
    | 'top' >> beam.combiners.Top.Of(
        10, key=lambda word_count: word_count[1])
    # Top.Of produces a single list holding the selected elements; flatten it
    # back into individual (word, count) pairs.
    | 'expand' >> beam.FlatMap(lambda top_words: top_words)
)  # This closing parenthesis was missing in the original expression.


# Format the counts into a PCollection of strings.
def format_result(word_count):
    """Render a (word, count) pair as 'word: count'."""
    (word, count) = word_count
    return '%s: %d' % (word, count)


out = counts | 'format' >> beam.Map(format_result)
out | 'write' >> WriteToText(known_args.output)