Specifying the Beam version for the Dataflow operator on Cloud Composer

Asked: 2019-04-17 20:51:42

Tags: google-cloud-dataflow apache-beam google-cloud-composer

We have written our Beam pipeline against version 2.11, but when we try to run it on Cloud Composer with the DataflowOperator, it uses SDK version 2.5.

Is there anywhere we can specify that 2.11 should be used?

Pipeline:

import argparse
import apache_beam as beam
from apache_beam.io.gcp import gcsio
from apache_beam.options.pipeline_options import PipelineOptions
import logging
from google.cloud import storage
import numpy as np
import pandas as pd
from sp_vcf import VCF

GCS_PREFIX = 'gs://'


def run(argv=None):
    """
    Create and run Dataflow pipeline.
    :return: none
    """

    parser = argparse.ArgumentParser()

    # Add the arguments needed for this specific Dataflow job.
    parser.add_argument('--gvcf_bucket', dest='gvcf_bucket', required=True,
                        help='Bucket on Google Cloud Storage to read gvcf files from.')
    parser.add_argument('--parquet_bucket', dest='parquet_bucket', required=True,
                        help='Bucket on Google Cloud Storage to write parquet files to.')

    parser.add_argument('--destination_table', dest='destination_table', required=True,
                        help='BigQuery table where transformed gvcfs should land')

    parser.add_argument('--bq_dataset', dest='bq_dataset', required=True,
                        help='BigQuery dataset where destination table lives')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Add argument so that declared constants (ie, GCS_PREFIX)
    # are available to Dataflow workers
    pipeline_args.append('--save_main_session')

    # Set options necessary for pipeline such as runner, project, region
    p_opts = PipelineOptions(pipeline_args)

    # Create and run beam pipeline object
    with beam.Pipeline(options=p_opts) as p:

        # Sink info
        gvcf_bucket = known_args.gvcf_bucket
        parquet_sink = known_args.parquet_bucket

        # Set BigQuery Table spec for beam.io
        # format is: dataset.table
        table_spec = '{}.{}'.format(known_args.bq_dataset, known_args.destination_table)

        # Get files to transform
        files = get_files_to_transform(gvcf_bucket)

        if files:

            logging.info("Found {} files to transform".format((len(files))))

            # Create pcollection of list of files to transform
            gvcfs_to_transform = p | 'GetFiles' >> beam.Create(files)

            # Read gvcfs from gcs into pcollection
            parquets_to_load = gvcfs_to_transform | 'GvcfToParquet' >> beam.ParDo(GvcfToParquet(),
                                                                                  gvcf_bucket,
                                                                                  parquet_sink)

            # Read Parquet files into pcollection
            records = parquets_to_load | 'ReadParquet' >> beam.io.ReadAllFromParquet()

            # Load all Parquet files into BigQuery
            records | 'WriteParquetToBigQuery' >> beam.io.WriteToBigQuery(table_spec,
                                                                           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        else:
            logging.info("No new files found")


if __name__ == '__main__':
    run()
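
For context on the worker side: below is a minimal setup.py of the kind that could be shipped alongside the pipeline (via the setup_file option) so that the Dataflow workers install Beam 2.11. This is only a sketch; the package name and the version pins are assumptions, not our actual file.

# setup.py -- minimal sketch; the package name and pins are assumptions, not our real file.
import setuptools

setuptools.setup(
    name='gvcf_pipeline',
    version='0.1.0',
    install_requires=[
        'apache-beam[gcp]==2.11.0',  # pin the Beam SDK version the Dataflow workers should install
    ],
    packages=setuptools.find_packages(),
)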

Composer:

import datetime
import os

from airflow import models, configuration
from airflow.operators import subdag_operator, dummy_operator, bash_operator
from airflow.contrib.operators import dataflow_operator
import googleapiclient.discovery
import json
from computation_query_dag import computation_dag

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

DEFAULT_DAG_ARGS = {
    'start_date': yesterday,
    'retries': 0,
    'project_id': models.Variable.get('gcp_project'),
    'dataflow_default_options': {
        'project': models.Variable.get('gcp_project'),
        'temp_location': models.Variable.get('gcp_temp_location'),
        'staging_location': models.Variable.get('gcp_staging_location'),
        'runner': 'DataflowRunner',
        # 'region': 'us-central1',
    },
}

with models.DAG(dag_id='TestEngine',
                description='A DAG for allele analytics workflow',
                schedule_interval=None, default_args=DEFAULT_DAG_ARGS, start_date=yesterday) as dag:

    dataflow_scripts = os.path.join(configuration.get('core', 'dags_folder'), 'pipeline')

    # Args required for the ETL Dataflow job.
    gvcf_dataflow_job_args = {
        'gvcf_bucket': os.getenv('gvcf_bucket'),
        'parquet_bucket': os.getenv('parquet_bucket'),
        # 'job_name': os.getenv('gvcf_job_name'),
        #'setup_file': os.path.join(dataflow_scripts, 'setup.py'),
        'requirements_file': os.path.join(dataflow_scripts, 'requirements.txt'),
        #'extra_package': os.path.join(dataflow_scripts, 'sp_vcf.tar.gz'),
        'destination_table': os.getenv('call_table'),
        'bq_dataset': os.getenv('bq_dataset'),
        # 'py_file': os.path.join(dataflow_scripts, 'gvcf_pipeline.py')
    }

    # Dataflow task that will process and load.
    dataflow_gvcf = dataflow_operator.DataFlowPythonOperator(
        task_id="gvcf-etl-bigquery",
        py_file=os.path.join(dataflow_scripts, 'gvcf_pipeline.py'),
        # dataflow_default_options=DEFAULT_DAG_ARGS['dataflow_default_options'],
        options=gvcf_dataflow_job_args,
        # gcp_conn_id='google_cloud_default'
    )
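
As far as I can tell from the "Running command" line in the log further down, the operator simply turns every key in options / dataflow_default_options into a --key=value flag and launches the pipeline with the worker's local Python, so the submit-time SDK is whatever apache_beam is installed in the Composer environment. A rough, assumed illustration (not the hook's actual source):

# Rough illustration only, based on the "Running command" log line below:
# each option becomes a --key=value argument passed to the pipeline script.
cmd = ['python2', os.path.join(dataflow_scripts, 'gvcf_pipeline.py')]
cmd += ['--{}={}'.format(k, v) for k, v in gvcf_dataflow_job_args.items()]
# plus the dataflow_default_options (--runner, --project, --temp_location, ...)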

Since I am able to run the pipeline locally, I figured that if we specified the Beam version it would also work when run from Cloud Composer.
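
(Side note: a quick way to confirm which SDK version the environment will submit with is to check the installed apache_beam package on the Airflow worker; a minimal sketch, not part of our DAG:)

# Minimal sketch (not part of the original DAG): the Dataflow runner submits jobs
# with whatever apache_beam version is importable here.
import apache_beam
print(apache_beam.__version__)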

We have installed 2.11 into the Composer environment, but we get the following error:

*** Reading remote log from gs://us-central1-test-env-96162c22-bucket/logs/AlleleAnalyticsEngine/gvcf-etl-bigquery/2019-04-17T22:33:07.326577+00:00/1.log.
[2019-04-17 22:33:18,604] {models.py:1361} INFO - Dependencies all met for <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [queued]>
[2019-04-17 22:33:18,611] {models.py:1361} INFO - Dependencies all met for <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [queued]>
[2019-04-17 22:33:18,613] {models.py:1573} INFO -
-------------------------------------------------------------------------------
Starting attempt 1 of 1
-------------------------------------------------------------------------------

[2019-04-17 22:33:18,659] {models.py:1595} INFO - Executing <Task(DataFlowPythonOperator): gvcf-etl-bigquery> on 2019-04-17T22:33:07.326577+00:00
[2019-04-17 22:33:18,660] {base_task_runner.py:118} INFO - Running: ['bash', '-c', u'airflow run AlleleAnalyticsEngine gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 --job_id 209 --raw -sd DAGS_FOLDER/main_dag.py --cfg_path /tmp/tmpGhGCxD']
[2019-04-17 22:33:20,148] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:20,147] {settings.py:176} INFO - setting.configure_orm(): Using pool settings. pool_size=5, pool_recycle=1800
[2019-04-17 22:33:21,073] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,072] {default_celery.py:80} WARNING - You have configured a result_backend of redis://airflow-redis-service.default.svc.cluster.local:6379/0, it is highly recommended to use an alternative result_backend (i.e. a database).
[2019-04-17 22:33:21,076] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,075] {__init__.py:51} INFO - Using executor CeleryExecutor
[2019-04-17 22:33:21,155] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,155] {app.py:51} WARNING - Using default Composer Environment Variables. Overrides have not been applied.
[2019-04-17 22:33:21,162] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,162] {configuration.py:516} INFO - Reading the config from /etc/airflow/airflow.cfg
[2019-04-17 22:33:21,174] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,174] {configuration.py:516} INFO - Reading the config from /etc/airflow/airflow.cfg
[2019-04-17 22:33:21,363] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:21,362] {models.py:271} INFO - Filling up the DagBag from /home/airflow/gcs/dags/main_dag.py
[2019-04-17 22:33:23,991] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:23,985] {cli.py:484} INFO - Running <TaskInstance: AlleleAnalyticsEngine.gvcf-etl-bigquery 2019-04-17T22:33:07.326577+00:00 [running]> on host airflow-worker-796dcd49fc-x7fx6
[2019-04-17 22:33:24,237] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:24,236] {gcp_dataflow_hook.py:120} INFO - Running command: python2 /home/airflow/gcs/dags/pipeline/gvcf_pipeline.py --runner=DataflowRunner --parquet_bucket=parquet_sink_test --runner=DataflowRunner --region=us-central1 --labels=airflow-version=v1-10-1-composer --destination_table=calls_table_test --project=genomics-207320 --bq_dataset=allele_analytics --gvcf_bucket=gvcf_sink_test --temp_location=gs://aa_dataflow_staging/temp --job_name=gvcf-etl-bigquery-cfc96be4
[2019-04-17 22:33:25,214] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:25,213] {gcp_dataflow_hook.py:151} INFO - Start waiting for DataFlow process to complete.
[2019-04-17 22:33:43,821] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,820] {gcp_dataflow_hook.py:132} WARNING - Traceback (most recent call last):
[2019-04-17 22:33:43,822] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/home/airflow/gcs/dags/pipeline/gvcf_pipeline.py", line 339, in <module>
[2019-04-17 22:33:43,822] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     run()
[2019-04-17 22:33:43,823] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/home/airflow/gcs/dags/pipeline/gvcf_pipeline.py", line 335, in run
[2019-04-17 22:33:43,823] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     logging.info("No new files found")
[2019-04-17 22:33:43,824] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 426, in __exit__
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self.run().wait_until_finish()
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 406, in run
[2019-04-17 22:33:43,825] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self._options).run(False)
[2019-04-17 22:33:43,827] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 419, in run
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     return self.runner.run_pipeline(self, self._options)
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 408, in run_pipeline
[2019-04-17 22:33:43,831] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self.dataflow_client = apiclient.DataflowApplicationClient(options)
[2019-04-17 22:33:43,832] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 445, in __init__
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     response_encoding=get_response_encoding())
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py", line 58, in __init__
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     response_encoding=response_encoding)
[2019-04-17 22:33:43,835] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery TypeError: __init__() got an unexpected keyword argument 'response_encoding'
[2019-04-17 22:33:43,832] {models.py:1760} ERROR - DataFlow failed with return code 1
Traceback (most recent call last):
  File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
    self.py_file, self.py_options)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
    label_formatter)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
    return func(self, *args, **kwargs)
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
    job_id = _Dataflow(cmd).wait_for_done()
  File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
    self._proc.returncode))
Exception: DataFlow failed with return code 1
[2019-04-17 22:33:43,840] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,832] {models.py:1760} ERROR - DataFlow failed with return code 1
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Traceback (most recent call last):
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     result = task_copy.execute(context=context)
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
[2019-04-17 22:33:43,841] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self.py_file, self.py_options)
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     label_formatter)
[2019-04-17 22:33:43,843] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
[2019-04-17 22:33:43,844] {models.py:1791} INFO - Marking task as FAILED.
[2019-04-17 22:33:43,844] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     return func(self, *args, **kwargs)
[2019-04-17 22:33:43,845] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
[2019-04-17 22:33:43,845] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     job_id = _Dataflow(cmd).wait_for_done()
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self._proc.returncode))
[2019-04-17 22:33:43,847] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Exception: DataFlow failed with return code 1
[2019-04-17 22:33:43,848] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery [2019-04-17 22:33:43,844] {models.py:1791} INFO - Marking task as FAILED.
[2019-04-17 22:33:43,890] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Traceback (most recent call last):
[2019-04-17 22:33:43,891] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/bin/airflow", line 7, in <module>
[2019-04-17 22:33:43,891] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     exec(compile(f.read(), __file__, 'exec'))
[2019-04-17 22:33:43,892] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/bin/airflow", line 32, in <module>
[2019-04-17 22:33:43,892] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     args.func(args)
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/utils/cli.py", line 74, in wrapper
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     return f(*args, **kwargs)
[2019-04-17 22:33:43,893] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/bin/cli.py", line 490, in run
[2019-04-17 22:33:43,894] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     _run(args, dag, ti)
[2019-04-17 22:33:43,894] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/bin/cli.py", line 406, in _run
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     pool=args.pool,
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/utils/db.py", line 74, in wrapper
[2019-04-17 22:33:43,895] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     return func(*args, **kwargs)
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/models.py", line 1659, in _run_raw_task
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     result = task_copy.execute(context=context)
[2019-04-17 22:33:43,897] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/operators/dataflow_operator.py", line 332, in execute
[2019-04-17 22:33:43,899] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self.py_file, self.py_options)
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 241, in start_python_dataflow
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     label_formatter)
[2019-04-17 22:33:44,083] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 213, in wrapper
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     return func(self, *args, **kwargs)
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 199, in _start_dataflow
[2019-04-17 22:33:44,084] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     job_id = _Dataflow(cmd).wait_for_done()
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery   File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_dataflow_hook.py", line 172, in wait_for_done
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery     self._proc.returncode))
[2019-04-17 22:33:44,085] {base_task_runner.py:101} INFO - Job 209: Subtask gvcf-etl-bigquery Exception: DataFlow failed with return code 1

1 Answer:

Answer 0 (score: 0):

The solution was to add google-apitools==0.5.26 to the Composer environment using the PyPI packages option.
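
For completeness, the resulting PyPI package list for the Composer environment looked roughly like this (the apache-beam pin comes from the question itself and the [gcp] extra is an assumption; only the google-apitools pin is the fix described here):

apache-beam[gcp]==2.11.0
google-apitools==0.5.26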