我已经动态创建了subdag。一切正常,main_dag运行正常。它的PythonOperator函数被调用。但是Subdag中可调用的Python没有被调用。请帮助我。由于我是Airflow的新手,所以从不同来源获取并合并了此代码。
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
from copy import deepcopy
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2019, 12, 16),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def sub_dag_method_a():
"""
sub dag method a
"""
import pdb;pdb.set_trace()
print('Subdag method a')
return 'a'
def sub_dag_method_b():
"""
sub dag method a
"""
print('Subdag method b')
return 'b'
# sub dag arguments
def create_subdag(dag_parent, dag_id_child_prefix, db_name, dag_child_id, start_date, schedule_interval):
# dag params
# import pdb;pdb.set_trace()
dag_id_child = '%s.%s_%s' % (dag_parent,dag_child_id,dag_id_child_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_id=dag_id_child, schedule_interval=schedule_interval,
start_date=start_date, default_args=default_args_copy)
# operators
tid_check = 'dummy_task_start_%s' % dag_id_child_prefix
print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy)
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_id_child_prefix
print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy,
python_callable=sub_dag_method_a)
tid_check = 'save_to_es_fetch_from_db_%s' % dag_id_child_prefix
print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy,
provide_context=True,
python_callable=sub_dag_method_b)
tid_check = 'dummy_task_end_%s' % dag_id_child_prefix
print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag, default_args=default_args_copy)
method_start >> method_a
method_a >> method_b
method_b >> method_end
return subdag
# main default arguments
# main dag
main_dag = DAG('main_dag', default_args=deepcopy(main_default_args), schedule_interval=timedelta(hours=1),
start_date=datetime(2019, 12, 16))
# hello_world
def hello_world():
"""
Hello world
"""
i=0
subdag = create_subdag('main_dag', str(i), 'db_name'+str(i), 'task_dag',
main_dag.start_date, main_dag.schedule_interval)
# import pdb;pdb.set_trace()
sd_op = SubDagOperator(task_id='task_dag_'+str(i), subdag=subdag, dag=main_dag)
return subdag
# main task
main_task = PythonOperator(task_id='main_task', python_callable=hello_world, dag=main_dag)
# hello_world()
通过运行命令输出
airflow test 'main_dag' 'main_task' 2019/12/16
是
(alphavu3711_1) Noamans-MacBook-Pro-2:python3 noamanfaisalbinbadar$ airflow test 'main_dag' 'main_task' 2019/12/16
[2019-12-16 21:56:10,312] {settings.py:252} INFO - settings.configure_orm(): Using pool settings. pool_size=5, max_overflow=10, pool_recycle=1800, pid=4100
[2019-12-16 21:56:11,119] {__init__.py:51} INFO - Using executor SequentialExecutor
[2019-12-16 21:56:11,119] {dagbag.py:92} INFO - Filling up the DagBag from /Users/noamanfaisalbinbadar/code/alphavu/production/python3/fb_messenger_airflow/dags
[2019-12-16 21:56:11,415] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:630} INFO - Dependencies all met for <TaskInstance: main_dag.main_task 2019-12-16T00:00:00+00:00 [success]>
[2019-12-16 21:56:11,433] {taskinstance.py:841} INFO -
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:842} INFO - Starting attempt 2 of 1
[2019-12-16 21:56:11,433] {taskinstance.py:843} INFO -
--------------------------------------------------------------------------------
[2019-12-16 21:56:11,433] {taskinstance.py:862} INFO - Executing <Task(PythonOperator): main_task> on 2019-12-16T00:00:00+00:00
[2019-12-16 21:56:11,455] {python_operator.py:105} INFO - Exporting the following env vars:
AIRFLOW_CTX_DAG_ID=main_dag
AIRFLOW_CTX_TASK_ID=main_task
AIRFLOW_CTX_EXECUTION_DATE=2019-12-16T00:00:00+00:00
AIRFLOW_CTX_DAG_RUN_ID=scheduled__2019-12-16T00:00:00+00:00
dummy_task_start_0
get_from_facebook_and_save_to_db_0
save_to_es_fetch_from_db_0
dummy_task_end_0
[2019-12-16 21:56:11,459] {python_operator.py:114} INFO - Done. Returned value was: <DAG: main_dag.task_dag_0>
回答后的新方法是这样
from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
# dag params
# import pdb;pdb.set_trace()
dag_name_processed = '%s_%s' % (dag_name, dag_name_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
default_args=deepcopy(main_default_args))
def sub_dag_method_a(**kwargs):
"""
sub dag method a
"""
print('Subdag method a')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'a'
def sub_dag_method_b(**kwargs):
"""
sub dag method a
"""
print('Subdag method b')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'b'
with subdag:
# operators
tid_check = 'dummy_task_start_%s' % dag_name_prefix
# print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag)
# new tid
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_name_prefix
# print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_a,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'save_to_es_fetch_from_db_%s' % dag_name_prefix
# print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_b,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'dummy_task_end_%s' % dag_name_prefix
# print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag)
# dependencies
method_start >> method_a
method_a >> method_b
method_b >> method_end
# return subdag
return subdag
start_date_ = datetime.now() + timedelta(minutes=-1)
# getting list of dictionaries
conversation_infos = GetConversationIdsPageWise().get_all()
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
print(conversation_info)
i = conversation_info['id']
subdag_name = 'main_dag'
sub_dag = create_subdag(subdag_name, str(i), start_date_, timedelta(minutes=2), conversation_info)
print(sub_dag)
但是我什至无法创建多个Dags
答案 0 :(得分:1)
不可能用另一个运算符的“执行”方法动态创建SubDAG。这实际上是您尝试实现的目标。
在解析python代码并构造python文件顶层中可用的对象时,会创建DAG及其依赖项(包括SubDag)。在这种情况下,它将创建DAG并将其分配给main_dag变量,然后创建PythonOperator并将其分配给main_task。这就是在调度过程中发生的所有事情。然后不调用PythonOperator可调用对象。
执行任务并调用可调用对象-创建DAG已经为时已晚。到那时,所有DAG结构和依赖项都已创建,并且调度已完成。
基本上,您只能在计划程序中创建新的DAG(包括SubDAG)-计划程序解析所有python代码并创建DAG及其任务。然后,当特定任务(例如您提到的PythonOperator)的时间和依赖关系在其中一个Workers中(而不在Scheduler中)执行时,即使它们创建了DAGS,它也不会影响调度程序,并且永远不会调度已创建的DAG,从而执行它们
答案 1 :(得分:0)
我认为,您正在尝试根据对话信息动态创建subdag。我在您更新的代码中发现了几个问题
下面的代码对我有用
from fb_messenger.airflow_helpers.get_conversation_ids_page_wise import GetConversationIdsPageWise
from fb_messenger.airflow_helpers.get_conversation_messages_info import GetConversationMessagesInfo
from fb_messenger.airflow_helpers.save_to_es import SaveToES
from copy import deepcopy
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
import airflow
main_default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
}
def create_subdag(dag_name, dag_name_prefix, start_date, schedule_interval, conversation_info):
# dag params
# import pdb;pdb.set_trace()
dag_name_processed = '%s.%s' % (dag_name, dag_name_prefix)
# main default
default_args_copy = deepcopy(main_default_args)
subdag = DAG(dag_name_processed, schedule_interval=schedule_interval, start_date=start_date,
default_args=deepcopy(main_default_args))
def sub_dag_method_a(**kwargs):
"""
sub dag method a
"""
print('Subdag method a')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'a'
def sub_dag_method_b(**kwargs):
"""
sub dag method a
"""
print('Subdag method b')
print(kwargs['conversation_id'])
print(kwargs['updated_time'])
return 'b'
with subdag:
# operators
tid_check = 'dummy_task_start_%s' % dag_name_prefix
# print(tid_check)
method_start = DummyOperator(task_id=tid_check, dag=subdag)
# new tid
tid_check = 'get_from_facebook_and_save_to_db_%s' % dag_name_prefix
# print(tid_check)
method_a = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_a,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time':conversation_info['updated_time']})
# new tid
tid_check = 'save_to_es_fetch_from_db_%s' % dag_name_prefix
# print(tid_check)
method_b = PythonOperator(task_id=tid_check, dag=subdag, python_callable=sub_dag_method_b,
op_kwargs={'conversation_id':conversation_info['id'],
'updated_time': conversation_info['updated_time']})
# new tid
tid_check = 'dummy_task_end_%s' % dag_name_prefix
# print(tid_check)
method_end = DummyOperator(task_id=tid_check, dag=subdag)
# dependencies
method_start >> method_a
method_a >> method_b
method_b >> method_end
# return subdag
return subdag
sd = datetime.now()
main_dag = DAG('main_dag', default_args=deepcopy(main_default_args), schedule_interval=timedelta(hours=1),
start_date = sd)
# getting list of dictionaries
conversation_infos = GetConversationIdsPageWise().get_all()
print(conversation_infos)
print(len(conversation_infos))
for conversation_info in conversation_infos:
print(conversation_info)
i = conversation_info['id']
subdag_name = 'main_dag'
t_sub_dag = SubDagOperator(
subdag=create_subdag(subdag_name, str(i), sd, timedelta(minutes=2), conversation_info),
task_id=str(i),
dag=main_dag
)