Airflow 动态创建的 SubDAG 中,PythonOperator 的可调用函数未被调用

时间:2019-12-16 17:02:53

标签: python-3.x airflow airflow-scheduler airflow-operator

我动态创建了 subdag。一切看起来正常,main_dag 运行正常,它的 PythonOperator 函数也被调用了。但是 subdag 中的 Python 可调用函数(python_callable)没有被调用。请帮帮我。由于我是 Airflow 新手,这段代码是从不同来源获取并合并而成的。

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
from copy import deepcopy
import airflow

# Arguments handed (as deep copies) to ``default_args`` of the DAGs built below.
main_default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 12, 16),
    # E-mail notification flags are both switched off.
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
}

def sub_dag_method_a():
    """Print a marker line and return ``'a'``.

    Registered by ``create_subdag`` as the ``python_callable`` of the
    first PythonOperator of the sub-DAG.

    Returns:
        str: The literal ``'a'``.
    """
    # No interactive debugger breakpoint here: a task executed by a worker
    # has no terminal to attach to, so ``pdb.set_trace()`` would stall or
    # abort the task instead of helping.
    print('Subdag method a')
    return 'a'

def sub_dag_method_b(**kwargs):
    """Print a marker line and return ``'b'``.

    Registered by ``create_subdag`` as the ``python_callable`` of the
    second PythonOperator of the sub-DAG.

    Args:
        **kwargs: Accepted and ignored. ``create_subdag`` registers this
            callable with ``provide_context=True``, which makes the
            operator call it with keyword arguments; a zero-argument
            signature would raise ``TypeError`` at that point.

    Returns:
        str: The literal ``'b'``.
    """
    print('Subdag method b')
    return 'b'

# Factory for the child DAG.
def create_subdag(dag_parent, dag_id_child_prefix, db_name, dag_child_id, start_date, schedule_interval):
    """Build the four-task DAG ``<dag_parent>.<dag_child_id>_<dag_id_child_prefix>``.

    The tasks are chained as: start (dummy) -> method a -> method b ->
    end (dummy). Each generated task id is printed when it is built.

    Args:
        dag_parent: First component of the generated dag_id.
        dag_id_child_prefix: Suffix of the dag_id and of every task id.
        db_name: Accepted but not used by this function.
        dag_child_id: Middle component of the generated dag_id.
        start_date: Passed to the DAG as ``start_date``.
        schedule_interval: Passed to the DAG as ``schedule_interval``.

    Returns:
        The populated DAG object.
    """
    shared_args = deepcopy(main_default_args)
    subdag = DAG(
        dag_id='%s.%s_%s' % (dag_parent, dag_child_id, dag_id_child_prefix),
        schedule_interval=schedule_interval,
        start_date=start_date,
        default_args=shared_args,
    )

    def next_task_id(template):
        # Fill in the suffix, echo the resulting id, and hand it back.
        task_id = template % dag_id_child_prefix
        print(task_id)
        return task_id

    method_start = DummyOperator(
        task_id=next_task_id('dummy_task_start_%s'),
        dag=subdag,
        default_args=shared_args,
    )
    method_a = PythonOperator(
        task_id=next_task_id('get_from_facebook_and_save_to_db_%s'),
        dag=subdag,
        default_args=shared_args,
        python_callable=sub_dag_method_a,
    )
    method_b = PythonOperator(
        task_id=next_task_id('save_to_es_fetch_from_db_%s'),
        dag=subdag,
        default_args=shared_args,
        provide_context=True,
        python_callable=sub_dag_method_b,
    )
    method_end = DummyOperator(
        task_id=next_task_id('dummy_task_end_%s'),
        dag=subdag,
        default_args=shared_args,
    )

    # Linear chain: start -> a -> b -> end.
    method_start >> method_a >> method_b >> method_end

    return subdag

# Parent DAG: hourly schedule_interval, with its own deep copy of the shared default arguments.
main_dag = DAG(
    'main_dag',
    default_args=deepcopy(main_default_args),
    schedule_interval=timedelta(hours=1),
    start_date=datetime(2019, 12, 16),
)

# Callable run by ``main_task``.
def hello_world():
    """Build one child DAG, wrap it in a SubDagOperator and return the child DAG.

    NOTE: this function is the ``python_callable`` of ``main_task``, so the
    child DAG and its SubDagOperator are only constructed while that task
    is running, not when this file is parsed.

    Returns:
        The DAG object produced by ``create_subdag``.
    """
    suffix = str(0)
    child = create_subdag(
        'main_dag',
        suffix,
        'db_name' + suffix,
        'task_dag',
        main_dag.start_date,
        main_dag.schedule_interval,
    )
    # Constructed with ``dag=main_dag``; the operator object itself is not used further here.
    SubDagOperator(task_id='task_dag_' + suffix, subdag=child, dag=main_dag)
    return child


# Task of the parent DAG whose callable is ``hello_world``.
main_task = PythonOperator(
    task_id='main_task',
    python_callable=hello_world,
    dag=main_dag,
)
# hello_world()
运行以下命令得到的输出:

airflow test 'main_dag' 'main_task' 2019/12/16

(alphavu3711_1) Noamans-MacBook-Pro-2:python3 noamanfaisalbinbadar$ airflow test 'main_dag' 'main_task' 2019/12/16
[2019-12-16 21:56:10,312] {settings.py:252} INFO - settings.configure_orm(): Using pool settings. pool_size=5, max_overflow=10, pool_recycle=1800, pid=4100
[2019-12-16 21:56:11,119] {__init__.py:51} INFO - Using executor SequentialExecutor
[2019-12-16 21:56:11,119] {dagbag.py:92} INFO - Filling up the DagBag from /Users/noamanfaisalbinbadar/code/alphavu/production/python3/fb_messenger_airflow/dags
[2019-12-16 21:56:11,415] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:841} INFO - 
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:842} INFO - Starting attempt 2 of 1
[2019-12-16 21:56:11,433] {taskinstance.py:843} INFO - 
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:862} INFO - Executing <Task(PythonOperator): main_task> on 2019-12-16T00:00:00+00:00
[2019-12-16 21:56:11,455] {python_operator.py:105} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_ID=main_dag
AIRFLOW_CTX_TASK_ID=main_task
AIRFLOW_CTX_EXECUTION_DATE=2019-12-16T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=scheduled__2019-12-16T00:00:00+00:00
dummy_task_start_0
get_from_facebook_and_save_to_db_0
save_to_es_fetch_from_db_0
dummy_task_end_0
[2019-12-16 21:56:11,459] {python_operator.py:114} INFO - Done. Returned value was: <DAG: main_dag.task_dag_0>

根据回答修改后的新代码如下:

from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow


# Arguments handed (as deep copies) to ``default_args`` of the DAGs built below.
main_default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # E-mail notification flags are both switched off.
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
}

def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
    """Build a four-task DAG named ``<dag_name>_<dag_name_prefix>`` for one conversation.

    The tasks are chained as: start (dummy) -> method a -> method b ->
    end (dummy). Both Python tasks receive the conversation's ``id`` and
    ``updated_time`` through ``op_kwargs``.

    Args:
        dag_name: First component of the generated dag_id.
        dag_name_prefix: Second component of the dag_id; also the suffix
            of every task id.
        start_date: Passed to the DAG as ``start_date``.
        schedule_interval: Passed to the DAG as ``schedule_interval``.
        conversation_info: Object indexed with the keys ``'id'`` and
            ``'updated_time'`` (presumably a dict; verify against caller).

    Returns:
        The populated DAG object.

    Raises:
        KeyError: If ``conversation_info`` is a dict lacking ``'id'`` or
            ``'updated_time'``.
    """
    dag_name_processed = '%s_%s' % (dag_name, dag_name_prefix)
    # One private copy of the shared defaults is enough; the DAG is its only consumer.
    subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
                 default_args=deepcopy(main_default_args))

    def sub_dag_method_a(**kwargs):
        """Print a marker plus the conversation id and update time; return ``'a'``."""
        print('Subdag method a')
        print(kwargs['conversation_id'])
        print(kwargs['updated_time'])
        return 'a'

    def sub_dag_method_b(**kwargs):
        """Print a marker plus the conversation id and update time; return ``'b'``."""
        print('Subdag method b')
        print(kwargs['conversation_id'])
        print(kwargs['updated_time'])
        return 'b'

    # Looked up once; every operator gets its own dict copy so they share no mutable state.
    callable_kwargs = {
        'conversation_id': conversation_info['id'],
        'updated_time': conversation_info['updated_time'],
    }

    with subdag:
        method_start = DummyOperator(task_id='dummy_task_start_%s' % dag_name_prefix, dag=subdag)
        method_a = PythonOperator(task_id='get_from_facebook_and_save_to_db_%s' % dag_name_prefix,
                                  dag=subdag, python_callable=sub_dag_method_a,
                                  op_kwargs=dict(callable_kwargs))
        method_b = PythonOperator(task_id='save_to_es_fetch_from_db_%s' % dag_name_prefix,
                                  dag=subdag, python_callable=sub_dag_method_b,
                                  op_kwargs=dict(callable_kwargs))
        method_end = DummyOperator(task_id='dummy_task_end_%s' % dag_name_prefix, dag=subdag)
        # dependencies
        method_start >> method_a
        method_a >> method_b
        method_b >> method_end
    return subdag

# Module-level driver: build one DAG per conversation returned by GetConversationIdsPageWise.
# NOTE(review): start_date_ is derived from datetime.now(), so it changes every
# time this file is evaluated -- confirm that is intended.
start_date_ = datetime.now() + timedelta(minutes=-1)
# Fetch the conversations (described by the author as a list of dictionaries).
conversation_infos = GetConversationIdsPageWise().get_all()
# Debug output: the fetched data and the number of entries.
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
    print(conversation_info)
    i = conversation_info['id']
    subdag_name = 'main_dag'
    # NOTE(review): sub_dag is rebound on every iteration, and neither a parent DAG
    # nor a SubDagOperator is created here, so only the DAG from the last iteration
    # stays bound to a module-level name.
    sub_dag = create_subdag(subdag_name, str(i), start_date_, timedelta(minutes=2), conversation_info)
    print(sub_dag)


但是这样我甚至无法创建多个 DAG。

2 个答案:

答案 0 :(得分:1)

无法在另一个运算符的 “execute” 方法中动态创建 SubDAG,而这正是您试图做的事情。

DAG 及其依赖项(包括 SubDAG)是在解析 Python 代码、构造 Python 文件顶层可用的对象时创建的。在这种情况下,它会创建 DAG 并将其赋给 main_dag 变量,然后创建 PythonOperator 并将其赋给 main_task。这就是调度过程中发生的全部事情,此时并不会调用 PythonOperator 的可调用对象。

等到任务执行、可调用对象被调用时,再创建 DAG 已经为时已晚。到那时,所有 DAG 结构和依赖关系都已创建完毕,调度也已完成。

基本上,您只能在调度程序(Scheduler)中创建新的 DAG(包括 SubDAG)——调度程序会解析所有 Python 代码并创建 DAG 及其任务。之后,当某个任务(例如您提到的 PythonOperator)的时间和依赖条件满足时,它会在某个 Worker 中(而不是在 Scheduler 中)执行;即使该任务在执行时创建了 DAG,也不会影响调度程序,这些新创建的 DAG 永远不会被调度,因而也永远不会被执行。

答案 1 :(得分:0)

我认为您是想根据对话信息动态创建 subdag。我在您更新后的代码中发现了几个问题:

  1. 它应该有一个主要的dag对象,需要将其传递给subdag函数。
  2. 需要通过 SubDagOperator 来调用创建 subdag 的函数,而您的代码中缺少该运算符。
  3. SubDAG 的名称需要符合 “parent_dag_name.child_dag_name” 的模式,而不是 “parent_dag_name_child_dag_name”。

下面的代码对我有用

from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow


# Arguments handed (as deep copies) to ``default_args`` of the DAGs built below.
main_default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # E-mail notification flags are both switched off.
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
}

def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
    """Build a four-task DAG named ``<dag_name>.<dag_name_prefix>`` for one conversation.

    The tasks are chained as: start (dummy) -> method a -> method b ->
    end (dummy). Both Python tasks receive the conversation's ``id`` and
    ``updated_time`` through ``op_kwargs``.

    Args:
        dag_name: First component of the generated dag_id (the part
            before the dot).
        dag_name_prefix: Second component of the dag_id (after the dot);
            also the suffix of every task id.
        start_date: Passed to the DAG as ``start_date``.
        schedule_interval: Passed to the DAG as ``schedule_interval``.
        conversation_info: Object indexed with the keys ``'id'`` and
            ``'updated_time'`` (presumably a dict; verify against caller).

    Returns:
        The populated DAG object.

    Raises:
        KeyError: If ``conversation_info`` is a dict lacking ``'id'`` or
            ``'updated_time'``.
    """
    dag_name_processed = '%s.%s' % (dag_name, dag_name_prefix)
    # One private copy of the shared defaults is enough; the DAG is its only consumer.
    subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
                 default_args=deepcopy(main_default_args))

    def sub_dag_method_a(**kwargs):
        """Print a marker plus the conversation id and update time; return ``'a'``."""
        print('Subdag method a')
        print(kwargs['conversation_id'])
        print(kwargs['updated_time'])
        return 'a'

    def sub_dag_method_b(**kwargs):
        """Print a marker plus the conversation id and update time; return ``'b'``."""
        print('Subdag method b')
        print(kwargs['conversation_id'])
        print(kwargs['updated_time'])
        return 'b'

    # Looked up once; every operator gets its own dict copy so they share no mutable state.
    callable_kwargs = {
        'conversation_id': conversation_info['id'],
        'updated_time': conversation_info['updated_time'],
    }

    with subdag:
        method_start = DummyOperator(task_id='dummy_task_start_%s' % dag_name_prefix, dag=subdag)
        method_a = PythonOperator(task_id='get_from_facebook_and_save_to_db_%s' % dag_name_prefix,
                                  dag=subdag, python_callable=sub_dag_method_a,
                                  op_kwargs=dict(callable_kwargs))
        method_b = PythonOperator(task_id='save_to_es_fetch_from_db_%s' % dag_name_prefix,
                                  dag=subdag, python_callable=sub_dag_method_b,
                                  op_kwargs=dict(callable_kwargs))
        method_end = DummyOperator(task_id='dummy_task_end_%s' % dag_name_prefix, dag=subdag)
        # dependencies
        method_start >> method_a
        method_a >> method_b
        method_b >> method_end
    return subdag

# Module-level driver: create the parent DAG and attach one SubDagOperator per conversation.
# NOTE(review): sd comes from datetime.now(), so the start date differs on every
# evaluation of this file -- confirm that is intended.
sd = datetime.now()
# Parent DAG with an hourly schedule_interval and its own deep copy of the shared defaults.
main_dag = DAG('main_dag', default_args=deepcopy(main_default_args), schedule_interval=timedelta(hours=1),
start_date = sd)

# Fetch the conversations (described by the author as a list of dictionaries).
conversation_infos = GetConversationIdsPageWise().get_all()
# Debug output: the fetched data and the number of entries.
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
    print(conversation_info)
    i = conversation_info['id']
    subdag_name = 'main_dag'

    # task_id is str(i) and create_subdag names the child DAG 'main_dag.<i>',
    # so the child dag_id has the form '<parent dag_id>.<task_id>'.
    # NOTE(review): the child DAGs get timedelta(minutes=2) while main_dag uses
    # timedelta(hours=1) -- confirm the differing intervals are intended.
    t_sub_dag = SubDagOperator(
            subdag=create_subdag(subdag_name, str(i), sd, timedelta(minutes=2), conversation_info),
            task_id=str(i),
            dag=main_dag
            )