Creating multiple tasks in Airflow using a loop

Asked: 2020-05-10 22:44:29

Tags: airflow

I want to create tasks that update a column and send an email for each row in a data table. So far I have created the task that downloads the data from the main table, but I cannot create a task for each row of the temporary data table. Can you tell me what I am doing wrong, and how to generate and run the tasks in a loop?

from datetime import datetime, timedelta

import airflow
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryValueCheckOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'cmap',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}


with DAG('dq_bigquery_test',
         max_active_runs=1,
         schedule_interval='@once',
         catchup=False,
         default_args=default_args) as dag:

    query = "SELECT * from `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` where MailRequired = false"
    insert = "INSERT into dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc (DataTimeStamp, Robot, Status) Values (CURRENT_TIMESTAMP(), 'TestRobot', 'Test')"

    my_bq_task = BigQueryOperator(
                    task_id='query_exc_on_teste',
                    sql=query,
                    write_disposition='WRITE_TRUNCATE',
                    create_disposition='CREATE_IF_NEEDED',
                    bigquery_conn_id='google_cloud_dbce_bi_prod',
                    use_legacy_sql=False,
                    destination_dataset_table='dev_dataquality.testTable')



    get_data = BigQueryGetDataOperator(
        task_id='get_data_from_query',
        project_id='dbce-bi-prod-e6fd',
        dataset_id='dev_dataquality',
        table_id='testTable',
        max_results='100',
        selected_fields='Robot,Status,MailRequired',
        bigquery_conn_id='google_cloud_dbce_bi_prod'
        )
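    # BigQueryGetDataOperator returns the selected rows as a list of lists,
    # which Airflow pushes to XCom so the next task can pull them.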

    def process_data_from_bq(**kwargs):
        ti = kwargs['ti']
        update_column = []
        # Pull the rows returned by get_data_from_query out of XCom;
        # bq_data is now a Python list of rows.
        bq_data = ti.xcom_pull(task_ids='get_data_from_query')
        print(bq_data)
        for index, row in enumerate(bq_data):
            update_query = ("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
                            "SET MailSent = True WHERE Robot = '{}'".format(row[0]))
            print(update_query)
            # This is the part that does not work: the BigQueryOperator
            # instances are created here, while the DAG is already running.
            update_column.append(BigQueryOperator(
                    task_id='update_column_{}'.format(index),
                    sql=update_query,
                    write_disposition='WRITE_EMPTY',
                    create_disposition='CREATE_IF_NEEDED',
                    bigquery_conn_id='google_cloud_dbce_bi_prod',
                    use_legacy_sql=False,
                    dag=dag))
            if index != 0:
                update_column[index - 1] >> update_column[index]

    process_data = PythonOperator(
        task_id='process_data_from_bq',
        python_callable=process_data_from_bq,
        provide_context=True
        )
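    # An alternative I am considering (untested sketch): skip creating
    # operators at runtime altogether and run the UPDATE statements directly
    # inside the callable through the contrib BigQueryHook, something like:
    #
    #     from airflow.contrib.hooks.bigquery_hook import BigQueryHook
    #
    #     def process_data_from_bq(**kwargs):
    #         bq_data = kwargs['ti'].xcom_pull(task_ids='get_data_from_query')
    #         cursor = BigQueryHook(bigquery_conn_id='google_cloud_dbce_bi_prod',
    #                               use_legacy_sql=False).get_conn().cursor()
    #         for row in bq_data:
    #             cursor.run_query("UPDATE `dbce-bi-prod-e6fd.dev_dataquality"
    #                              ".data_logging_inc` SET MailSent = True "
    #                              "WHERE Robot = '{}'".format(row[0]))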



    my_bq_task >> get_data >> process_data
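
From what I have read, Airflow only schedules tasks that exist when the DAG file is parsed, so operators instantiated inside a running PythonOperator never get registered with the scheduler. Below is a minimal sketch of the parse-time loop pattern as I understand it (untested; the robots list is a hypothetical placeholder, since in my real case the values only exist after the query task has run, which is exactly my problem):

robots = ['TestRobot1', 'TestRobot2']  # hypothetical, known at parse time

previous_task = get_data
for index, robot in enumerate(robots):
    update_task = BigQueryOperator(
        task_id='update_column_{}'.format(index),
        sql=("UPDATE `dbce-bi-prod-e6fd.dev_dataquality.data_logging_inc` "
             "SET MailSent = True WHERE Robot = '{}'".format(robot)),
        write_disposition='WRITE_EMPTY',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id='google_cloud_dbce_bi_prod',
        use_legacy_sql=False,
        dag=dag)
    # chain the update tasks one after another
    previous_task >> update_task
    previous_task = update_task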

Thanks for your help!

0 Answers:

No answers yet.