Any help would be greatly appreciated!!!
I am using Dataflow to process H5 (HDF5 format) files.
To do that, I created a setup.py file based on the juliaset example referenced in another ticket. My only change is the list of packages to install:
REQUIRED_PACKAGES = [
    'numpy',
    'h5py',
    'pandas',
    'tables',
]
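For reference, here is a minimal sketch of what such a setup.py can look like (the package name and version are placeholders; the full juliaset example also defines custom commands for installing system-level dependencies on the workers):

import setuptools

REQUIRED_PACKAGES = [
    'numpy',
    'h5py',
    'pandas',
    'tables',
]

setuptools.setup(
    name='h5-dataflow-pipeline',  # placeholder name
    version='0.0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages(),
)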
The pipeline is as follows:
import argparse
import logging

import h5py
import numpy as np
import pandas as pd

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class ReadGcsBlobs(beam.DoFn):
    """Reads each GCS path it receives and emits a (path, contents) pair."""
    def process(self, element, *args, **kwargs):
        # Imported here so the dependency is resolved on the worker.
        from apache_beam.io.gcp import gcsio
        gcs = gcsio.GcsIO()
        yield (element, gcs.open(element).read())


class H5Preprocess(beam.DoFn):
    """Opens an H5 file and emits a small slice of the 'expression' dataset."""
    def process(self, element):
        logging.info('**********starting to read H5')
        hdf = h5py.File(element, 'r')  # keep a handle so the groups below can be read
        logging.info('**********finished reading H5')
        expression = hdf['/data/']['expression']
        logging.info('**********finished reading the expression node')
        np_expression = expression[1:2, 1:2]
        logging.info('**********subset the expression to numpy 2x2')
        yield (element, np_expression)


def run(argv=None):
    pipeline_options = PipelineOptions(argv)
    parser = argparse.ArgumentParser(description="read from h5 blob and write to file")
    # parser.add_argument('--input', help='Input for the pipeline', default='gs://archs4/human_matrix.h5')
    # parser.add_argument('--output', help='output for the pipeline', default='gs://archs4/output.txt')
    # known_args, pipeline_args = parser.parse_known_args(argv)
    logging.info('**********finished with the parser')
    # What are the args relevant for, when the parameters are known_args.input and known_args.output?
    # with beam.Pipeline(options=PipelineOptions(argv=pipeline_args)) as p:
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'Initialize' >> beam.Create(['gs://archs4/human_matrix.h5'])
         | 'Read-blobs' >> beam.ParDo(ReadGcsBlobs())
         | 'pre-process' >> beam.ParDo(H5Preprocess())
         | 'write' >> beam.io.WriteToText('gs://archs4/outputData.txt')
         )
        # The with-block submits and waits for the pipeline on exit,
        # so no explicit p.run() is needed here.


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
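One caveat worth flagging here: h5py.File expects a local path or a file-like object rather than a gs:// URI, and H5Preprocess actually receives the (path, contents) tuple emitted by ReadGcsBlobs. A sketch of how the DoFn could open the blob from the bytes already in memory (this assumes h5py >= 2.9, which accepts file-like objects):

import io

class H5Preprocess(beam.DoFn):
    def process(self, element):
        path, content = element  # unpack the (path, contents) pair from ReadGcsBlobs
        # Open the HDF5 file from in-memory bytes; assumes h5py >= 2.9.
        with h5py.File(io.BytesIO(content), 'r') as hdf:
            np_expression = hdf['/data/']['expression'][1:2, 1:2]
        yield (path, np_expression)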
The execution command is as follows:
python beam_try1.py --job-name beam-try1 --project orielresearch-188115 --runner DataflowRunner --setup_file ./setup.py --temp_location=gs://archs4/tmp --staging_location gs://archs4/staging
And the pipeline error is as follows:
(5a4c72cfc5507714): Workflow failed. Causes: (3bde8bf810c652b2): S04:Initialize/Read+Read-blobs+pre-process+write/Write/WriteImpl/WriteBundles/WriteBundles+write/Write/WriteImpl/Pair+write/Write/WriteImpl/WindowInto(WindowIntoFn)+write/Write/WriteImpl/GroupByKey/Reify+write/Write/WriteImpl/GroupByKey/Write failed., (7b4a7abb1a692d12): A work item was attempted 4 times without success. Each time the worker eventually lost contact with the service. The work item was attempted on:
beamapp-eila-0213182449-2-02131024-1621-harness-vf4f,
beamapp-eila-0213182449-2-02131024-1621-harness-vf4f,
beamapp-eila-0213182449-2-02131024-1621-harness-vf4f,
beamapp-eila-0213182449-2-02131024-1621-harness-vf4f
Any advice on what needs to be fixed?
Thanks, eilalan
Answer 0 (score: 0):
Have you tried running a subset of the data with the local runner? That might give you more information about what is going wrong.
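For example, switching the runner flag reruns the same pipeline on the local DirectRunner (a sketch; you may also want to point the 'Initialize' step at a smaller test file first):

python beam_try1.py --project orielresearch-188115 --runner DirectRunner --setup_file ./setup.py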