Airflow DAG not working with the DagBag: dynamic tasks

Asked: 2020-04-21 16:14:33

Tags: airflow

For the past couple of days I've been struggling to create a new DAG that dynamically generates its tasks from a list produced at runtime. I was able to build the DAG and get it to sync successfully while the generated tasks were built from a list stored in the module with the DAG. I then changed that to a list generated at runtime by retrieving a text file from an SFTP server.

When I use a hard-coded list in the code below, the DAG syncs fine, but now that the list is built dynamically by run_main(), the DAG will not sync.
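For reference, this is roughly the version that synced correctly before, with the slice list hard-coded in the module instead of returned by run_main(); the list values here are just an illustration, and everything else in the file below was the same:

task_create = [[0, 10000], [10000, 20000], [20000, 30000]]

for slice_list in task_create:
    s1 = PythonOperator(
        task_id='load_test_data_{}_{}'.format(slice_list[0], slice_list[1]),
        python_callable=data_transfer.run_main,
        dag=dag,
        provide_context=True,
        op_kwargs={'slice': slice_list}
    )
    sftp_sensor >> s1 >> sftp1 >> sns1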

I'll be editing this question throughout the day to try to add more detail. I'm still fairly new to Airflow, so I know I'm not being very clear here.

import boto3
import os
import pkg_resources
import math
import sys
from airflow import DAG
from airflow.contrib.sensors.sftp_sensor import SFTPSensor
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.base_hook import BaseHook
from datetime import datetime, timedelta, date
from warp_pipelines.dsg import data_transfer, sftp_file_rename
from warp_pipelines.utils import utils
from warp.api_wrappers import ftp

args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 1, 8),
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(dag_id='test_rev_transfer',
        default_args=args,
        schedule_interval="0 10 * * *",
        catchup=False
        )

#build last file path
folder_path= '/test_sftp/'
file_date= "{{ execution_date.strftime('%y%m%d') }}"
full_path= '{}360i_{}.txt'.format(folder_path, file_date)

#return item to build out load dsg_data tasks
def run_main():
    """

    """
    folder_path= '/test_sftp/'
    file_date= "{{ execution_date.strftime('%y%m%d') }}"
    full_path= '{}test_{}.txt'.format(folder_path, file_date)
    sa = utils.load_yaml(pkg_resources.resource_filename('warp_pipelines',
                                                         'test/sftp_rev.yaml')
                       )
    #build last file path
    sftp_conn= ftp.conn(host= sa['host'],
                        user= sa['user'],
                        con_type='sftp',
                        password= sa['pwd']
                        )

    sftp= ftp.FTPWrap(sftp_conn)

    try:
        rev_df= sftp.get_df(full_path,
                            delimiter='\t')
    except:
        sys.exit()

    bool_df = rev_df[rev_df['Orders'] > 0]

    rt = math.ceil((bool_df.shape[0]/10000))
    task_list = []
    for i in range(0,rt):
        if i == 0:
            value1 = 0
        else:
            value1 = i*10000
        value2=(i*10000)+10000
        task_list.append([value1, value2])

    return task_list


#custom function for sns
def run_aws_sns_topic():
    connection = BaseHook.get_connection("aws_global")
    aws_access_key_id = connection.login
    aws_secret_access_key = connection.password

    client = boto3.client('sns',
                          region_name='us-east-1',
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key
                          )
    response = client.publish(
        TopicArn='arn:aws:sns:us-east-1:288219908733:test_revenue_transfer',
        Message='Data load success.',
        Subject='test_data_load {}'.format((date.today() - timedelta(days=1)).strftime('%B %d %Y'))
    )


sftp_sensor= SFTPSensor(
    task_id= 'test_rev_sensor',
    path=full_path,
    sftp_conn_id= 'test_rev_conn',
    poke_interval= 600,
    mode= 'reschedule',
    dag=dag
)


sftp1 = PythonOperator(
    task_id = 'sftp_file_move',
    python_callable=sftp_file_rename.run_main,
    dag=dag,
    provide_context=True
)


sns1 = PythonOperator(task_id='sns_success_send',
                      python_callable=run_aws_sns_topic,
                      dag=dag)


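#build the slice list at module level; run_main() is executed every time the DAG file is parsed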
task_create = run_main()


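#create one load task per 10,000-row slice returned by run_main()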
for slice_list in task_create:
    s1 = PythonOperator(
        task_id = 'load_test_data_{}_{}'.format(slice_list[0], slice_list[1]),
        python_callable=data_transfer.run_main,
        dag=dag,
        provide_context=True,
        op_kwargs= {'slice': slice_list}
        )
    sftp_sensor >> s1 >> sftp1 >> sns1

0 answers