We wrote our Beam pipeline against version 2.11, but when we try to run it on Cloud Composer with the DataflowOperator, it uses SDK version 2.5.
Is there anywhere we can specify that 2.11 should be used?
Pipeline:
import argparse
import apache_beam as beam
from apache_beam.io.gcp import gcsio
from apache_beam.options.pipeline_options import PipelineOptions
import logging
from google.cloud import storage
import numpy as np
import pandas as pd
from sp_vcf import VCF
GCS_PREFIX = 'gs://'
def run(argv=None):
    """
    Create and run Dataflow pipeline.
    :return: none
    """
    parser = argparse.ArgumentParser()

    # Add the arguments needed for this specific Dataflow job.
    parser.add_argument('--gvcf_bucket', dest='gvcf_bucket', required=True,
                        help='Bucket on Google Cloud Storage to read gvcf files from.')
    parser.add_argument('--parquet_bucket', dest='parquet_bucket', required=True,
                        help='Bucket on Google Cloud Storage to write parquet files to.')
    parser.add_argument('--destination_table', dest='destination_table', required=True,
                        help='BigQuery table where transformed gvcfs should land')
    parser.add_argument('--bq_dataset', dest='bq_dataset', required=True,
                        help='BigQuery dataset where destination table lives')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Add argument so that declared constants (ie, GCS_PREFIX)
    # are available to Dataflow workers
    pipeline_args.append('--save_main_session')

    # Set options necessary for pipeline such as runner, project, region
    p_opts = PipelineOptions(pipeline_args)

    # Create and run beam pipeline object
    with beam.Pipeline(options=p_opts) as p:
        # Sink info
        gvcf_bucket = known_args.gvcf_bucket
        parquet_sink = known_args.parquet_bucket

        # Set BigQuery Table spec for beam.io
        # format is: dataset.table
        table_spec = '{}.{}'.format(known_args.bq_dataset, known_args.destination_table)

        # Get files to transform
        files = get_files_to_transform(gvcf_bucket)
        if files:
            logging.info("Found {} files to transform".format(len(files)))

            # Create pcollection of list of files to transform
            gvcfs_to_transform = p | 'GetFiles' >> beam.Create(files)

            # Read gvcfs from gcs into pcollection
            parquets_to_load = gvcfs_to_transform | 'GvcfToParquet' >> beam.ParDo(GvcfToParquet(),
                                                                                  gvcf_bucket,
                                                                                  parquet_sink)

            # Read Parquet files into pcollection
            records = parquets_to_load | 'ReadParquet' >> beam.io.ReadAllFromParquet()

            # Load all Parquet files into BigQuery
            records | 'WriteParquetToBigQuery' >> beam.io.WriteToBigQuery(table_spec,
                                                                          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        else:
            logging.info("No new files found")


if __name__ == '__main__':
    run()
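Note: get_files_to_transform and GvcfToParquet are defined elsewhere in the full pipeline file (the traceback below shows gvcf_pipeline.py is around 339 lines). Purely as an illustration of what the bucket-listing helper might look like — this is our sketch using the google.cloud.storage client imported above, not the original implementation:

# Hypothetical sketch only -- the real get_files_to_transform lives in the full
# gvcf_pipeline.py and may differ. It just needs to return a list of object names.
def get_files_to_transform(bucket_name):
    """List gvcf object names in the given GCS bucket (illustrative assumption)."""
    client = storage.Client()
    blobs = client.bucket(bucket_name).list_blobs()
    return [blob.name for blob in blobs if blob.name.endswith('.gvcf')]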
Composer:
import datetime
import os
from airflow import models, configuration
from airflow.operators import subdag_operator, dummy_operator, bash_operator
from airflow.contrib.operators import dataflow_operator
import googleapiclient.discovery
import json
from computation_query_dag import computation_dag
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

DEFAULT_DAG_ARGS = {
    'start_date': yesterday,
    'retries': 0,
    'project_id': models.Variable.get('gcp_project'),
    'dataflow_default_options': {
        'project': models.Variable.get('gcp_project'),
        'temp_location': models.Variable.get('gcp_temp_location'),
        'staging_location': models.Variable.get('gcp_staging_location'),
        'runner': 'DataflowRunner',
        # 'region': 'us-central1',
    },
}

with models.DAG(dag_id='TestEngine',
                description='A DAG for allele analytics workflow',
                schedule_interval=None, default_args=DEFAULT_DAG_ARGS, start_date=yesterday) as dag:

    dataflow_scripts = os.path.join(configuration.get('core', 'dags_folder'), 'pipeline')

    # Args required for the ETL Dataflow job.
    gvcf_dataflow_job_args = {
        'gvcf_bucket': os.getenv('gvcf_bucket'),
        'parquet_bucket': os.getenv('parquet_bucket'),
        # 'job_name': os.getenv('gvcf_job_name'),
        # 'setup_file': os.path.join(dataflow_scripts, 'setup.py'),
        'requirements_file': os.path.join(dataflow_scripts, 'requirements.txt'),
        # 'extra_package': os.path.join(dataflow_scripts, 'sp_vcf.tar.gz'),
        'destination_table': os.getenv('call_table'),
        'bq_dataset': os.getenv('bq_dataset'),
        # 'py_file': os.path.join(dataflow_scripts, 'gvcf_pipeline.py')
    }

    # Dataflow task that will process and load.
    dataflow_gvcf = dataflow_operator.DataFlowPythonOperator(
        task_id="gvcf-etl-bigquery",
        py_file=os.path.join(dataflow_scripts, 'gvcf_pipeline.py'),
        # dataflow_default_options=DEFAULT_DAG_ARGS['dataflow_default_options'],
        options=gvcf_dataflow_job_args,
        # gcp_conn_id='google_cloud_default'
    )
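Since the Dataflow hook just shells out to python2 on the Airflow worker (see the "Running command: python2 ..." line in the log below), the SDK version that gets submitted is whatever apache_beam resolves to in the Composer environment. A minimal diagnostic we can run there (a sketch on our part, assuming apache_beam is importable in that environment, not part of the DAG above) is:

# Sketch: log which Beam SDK the Airflow worker would submit the Dataflow job with.
import logging
import apache_beam

def log_beam_sdk_version():
    logging.info("apache_beam version on this worker: %s", apache_beam.__version__)

log_beam_sdk_version()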
Since I was able to run the pipeline locally, I assumed that if we specified the Beam version it would also work when run from Cloud Composer.
We have installed 2.11 into the Composer environment, but we get the following error:
*** Reading remote log from gs://us-central1-test-env-96162c22-bucket/logs/AlleleAnalyticsEngine/gvcf-etl-bigquery/2019-04-17T22:33:07.326577+00:00/1.log.
[2019-04-17 22:33:18,604] {models.py:1361} INFO - Dependencies all met for <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [queued]>
[2019-04-17 22:33:18,611] {models.py:1361} INFO - Dependencies all met for <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [queued]>
[2019-04-17 22:33:18,613] {models.py:1573} INFO -
-------------------------------------------------------------------------------
Starting attempt 1 of
-------------------------------------------------------------------------------
[2019-04-17 22:33:18,659] {models.py:1595} INFO - Executing <Task(DataFlowPythonOperator): gvcf-etl-bigquery> on 2019-04-17T22:33:07.326577+00:00
[2019-04-17 22:33:18,660] {base_task_runner.py:118} INFO - Running: ['bash', '-c', u'airflow run AlleleAnalyticsEngine gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 --job_id 209 --raw -sd DAGS_FOLDER/main_dag.py --cfg_path /tmp/tmpGhGCxD']
[2019-04-17 22:33:20,148] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:20,147] {settings.py:176} INFO - setting.configure_orm(): Using pool settings. pool_size=5, pool_recycle=1800
[2019-04-17 22:33:21,073] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,072] {default_celery.py:80} WARNING - You have configured a result_backend of redis://airflow-redis-service.default.svc.cluster.local:6379/0, it is highly recommended to use an alternative result_backend (i.e. a database).
[2019-04-17 22:33:21,076] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,075] {__init__.py:51} INFO - Using executor CeleryExecutor
[2019-04-17 22:33:21,155] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,155] {app.py:51} WARNING - Using default Composer Environment Variables. Overrides have not been applied.
[2019-04-17 22:33:21,162] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,162] {configuration.py:516} INFO - Reading the config from /etc/airflow/airflow.cfg
[2019-04-17 22:33:21,174] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,174] {configuration.py:516} INFO - Reading the config from /etc/airflow/airflow.cfg
[2019-04-17 22:33:21,363] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,362] {models.py:271} INFO - Filling up the DagBag from /home/airflow/gcs/dags/main_dag.py
[2019-04-17 22:33:23,991] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:23,985] {cli.py:484} INFO - Running <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [running]> on host airflow-worker-796dcd49fc-x7fx6
[2019-04-17 22:33:24,237] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:24,236] {gcp_dataflow_hook.py:120} INFO - Running command: python2 /home/airflow/gcs/dags/pipeline/gvcf_pipeline.py --runner=DataflowRunner --parquet_bucket=parquet_sink_test --runner=DataflowRunner --region=us-central1 --labels=airflow-version=v1-10-1-composer --destination_table=calls_table_test --project=genomics-207320 --bq_dataset=allele_analytics --gvcf_bucket=gvcf_sink_test --temp_location=gs://aa_dataflow_staging/temp --job_name=gvcf-etl-bigquery-cfc96be4
[2019-04-17 22:33:25,214] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:25,213] {gcp_dataflow_hook.py:151} INFO - Start waiting for DataFlow process to complete.
[2019-04-17 22:33:43,821] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,820] {gcp_dataflow_hook.py:132} WARNING - Traceback (most recent call last):
[2019-04-17 22:33:43,822] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/home/airflow/gcs/dags/pipeline/gvcf_pipeline.py", line 339, in <module>
[2019-04-17 22:33:43,822] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery run()
[2019-04-17 22:33:43,823] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/home/airflow/gcs/dags/pipeline/gvcf_pipeline.py", line 335, in run
[2019-04-17 22:33:43,823] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery logging.info("No new files found")
[2019-04-17 22:33:43,824] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 426, in __exit__
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self.run().wait_until_finish()
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 406, in run
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self._options).run(False)
[2019-04-17 22:33:43,827] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 419, in run
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery return self.runner.run_pipeline(self, self._options)
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 408, in run_pipeline
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self.dataflow_client = apiclient.DataflowApplicationClient(options)
[2019-04-17 22:33:43,832] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 445, in __init__
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery response_encoding=get_response_encoding())
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py", line 58, in __init__
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery response_encoding=response_encoding)
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery TypeError: __init__() got an unexpected keyword argument 'response_encoding'
[2019-04-17 22:33:43,832] {models.py:1760} ERROR - DataFlow failed with return code 1
Traceback (most recent call last):
  File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
    self.py_file, self.py_options)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
    label_formatter)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
    return func(self, *args, **kwargs)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
    job_id = _Dataflow(cmd).wait_for_done()
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
    self._proc.returncode))
Exception: DataFlow failed with return code 1
[2019-04-17 22:33:43,840] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,832] {models.py:1760} ERROR - DataFlow failed with return code 1
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Traceback (most recent call last):
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery result = task_copy.execute(context=context)
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self.py_file, self.py_options)
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery label_formatter)
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
[2019-04-17 22:33:43,844] {models.py:1791} INFO - Marking task as FAILED.
[2019-04-17 22:33:43,844] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery return func(self, *args, **kwargs)
[2019-04-17 22:33:43,845] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
[2019-04-17 22:33:43,845] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery job_id = _Dataflow(cmd).wait_for_done()
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self._proc.returncode))
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Exception: DataFlow failed with return code 1
[2019-04-17 22:33:43,848] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,844] {models.py:1791} INFO - Marking task as FAILED.
[2019-04-17 22:33:43,890] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Traceback (most recent call last):
[2019-04-17 22:33:43,891] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/bin/airflow", line 7, in <module>
[2019-04-17 22:33:43,891] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery exec(compile(f.read(), __file__, 'exec'))
[2019-04-17 22:33:43,892] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/bin/airflow", line 32, in <module>
[2019-04-17 22:33:43,892] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery args.func(args)
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/utils/cli.py", line 74, in wrapper
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery return f(*args, **kwargs)
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/bin/cli.py", line 490, in run
[2019-04-17 22:33:43,894] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery _run(args, dag, ti)
[2019-04-17 22:33:43,894] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/bin/cli.py", line 406, in _run
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery pool=args.pool,
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/utils/db.py", line 74, in wrapper
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery return func(*args, **kwargs)
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery result = task_copy.execute(context=context)
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
[2019-04-17 22:33:43,899] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self.py_file, self.py_options)
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery label_formatter)
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery return func(self, *args, **kwargs)
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery job_id = _Dataflow(cmd).wait_for_done()
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery self._proc.returncode))
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Exception: DataFlow failed with return code 1
Answer 0 (score: 0)
The solution was to add google-apitools==0.5.26 to the Composer environment using the PyPI packages option.
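As a follow-up sanity check (a sketch on our part, assuming it is run inside the Composer environment, for example from a short PythonOperator), the resolved package versions can be confirmed with pkg_resources:

# Sketch: confirm which versions the environment actually resolves after the update.
import pkg_resources

for pkg in ('apache-beam', 'google-apitools'):
    print(pkg, pkg_resources.get_distribution(pkg).version)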