我是 Airflow 的新手,想在循环中运行一批任务,但是我遇到了循环(cycle)错误。
import time

from datetime import timedelta
from datetime import datetime

from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2021, 4, 13),
'email': ['raff@abc.com', 'raffg@abc.com'],
'email_on_failure': True,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=5),
}
dag = DAG('sparktestingforstandalone',
schedule_interval='@yearly',
default_args=default_args,
catchup=False
)
sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '
start_op = DummyOperator(task_id='start_spark_runs',dag=dag)
t1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase',
command=linux_command_1,
dag=dag)
t1_1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_1',
command=linux_command_1,
dag=dag)
t2 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_3',
command=linux_command_2,
dag=dag)
t2_1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_12',
command=linux_command_2,
dag=dag)
t3 = SSHOperator(
ssh_hook=sshHook,
task_id='Join_useCase',
command=linux_command_3,
dag=dag)
t3_1 = SSHOperator(
ssh_hook=sshHook,
task_id='Join_useCase_1',
command=linux_command_3,
dag=dag)
t4 = SSHOperator(
ssh_hook=sshHook,
task_id='Denoramlize_usecase',
command=linux_command_5,
dag=dag)
t5 = SSHOperator(
ssh_hook=sshHook,
task_id='1798_useCase',
command=linux_command_5,
dag=dag)
t6 = SSHOperator(
ssh_hook=sshHook,
task_id='Json_Complex_Creation',
command=linux_command_6,
dag=dag)
t7 = SSHOperator(
ssh_hook=sshHook,
task_id='DB_to_DB_Creation',
command=linux_command_7,
dag=dag)
s1 = PythonOperator(task_id="delay_sleep_task_30sec",
dag=dag,
python_callable=lambda: time.sleep(30))
s1 = PythonOperator(task_id="delay_sleep_task_30sec_1",
dag=dag,
python_callable=lambda: time.sleep(30))
s2 = PythonOperator(task_id="delay_sleep_task_30sec_2",
dag=dag,
python_callable=lambda: time.sleep(30))
s3 = PythonOperator(task_id="delay_sleep_task_30sec_3",
dag=dag,
python_callable=lambda: time.sleep(30))
s4 = PythonOperator(task_id="delay_sleep_task_30sec_4",
dag=dag,
python_callable=lambda: time.sleep(30))
s5 = PythonOperator(task_id="delay_sleep_task_30sec_5",
dag=dag,
python_callable=lambda: time.sleep(30))
s6 = PythonOperator(task_id="delay_sleep_task_30sec_6",
dag=dag,
python_callable=lambda: time.sleep(30))
s7 = PythonOperator(task_id="delay_sleep_task_30sec_7",
dag=dag,
python_callable=lambda: time.sleep(30))
s8 = PythonOperator(task_id="delay_sleep_task_30sec_8",
dag=dag,
python_callable=lambda: time.sleep(30))
s9 = PythonOperator(task_id="delay_sleep_task_30sec_9",
dag=dag,
python_callable=lambda: time.sleep(30))
s10 = PythonOperator(task_id="delay_sleep_task_30sec_10",
dag=dag,
python_callable=lambda: time.sleep(30))
s11 = PythonOperator(task_id="delay_sleep_task_30sec_11",
dag=dag,
python_callable=lambda: time.sleep(30))
s12 = PythonOperator(task_id="delay_sleep_task_30sec_12",
dag=dag,
python_callable=lambda: time.sleep(30))
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
start_op >> t1 >> t1_1 >> end_op
start_op >> t2 >> t2_1 >> end_op
start_op >> t5 >> end_op
start_op >> t7 >> s1 >> t7 >> s2 >> t7 >> s3 >> end_op
start_op >> [t3,t4,t6] >> s4 >> [t3,t4,t6] >> s5 >> [t3,t4,t6] >> s6 >> [t3,t4,t6] >> s7 >> [t3,t4,t6] >> s8 >> [t3,t4,t6] >> s9 >> [t3,t4,t6] >> s10 >> [t3,t4,t6] >> s11 >> [t3,t4,t6] >> s12 >> end_op
我知道这很麻烦,有没有一种优雅的方法来实现它。
我想在循环中并行运行 t3、t4、t6 任务 n 次,并在每次运行之间休眠 30 秒。 还有多个其他任务,如 t7 也将被触发。
我想一次触发几个任务,在一个 dag 中多次触发几个任务,我不想像我在这里所做的那样创建那么多的实例,需要像提到的那样以优雅的方式来做。
答案 0 :(得分:0)
您不能在 Airflow 的 DAG 中创建循环:根据定义,DAG 是一个有向无环图(Directed Acyclic Graph)。
但是您可以使用 TriggerDagRunOperator。这将触发您定义的 DAG 的 DagRun。
def dag_run_payload(context, dag_run_obj):
# You can add the data of dag_run.conf in here
# use your context information and add it to the
# dag_run_obj.payload
dag_run_obj.payload = {}
trigger_next_iter = TriggerDagRunOperator(
dag=dag,
task_id='trigger_next_iter',
trigger_dag_id='sparktestingforstandalone', # Or any other DAG
execution_date="{{ ti.xcom_pull(...) }}", # Its templated
python_executable=dag_run_payload
)
end_op >> trigger_next_iter
您可以在 DAG 末尾附加触发器。
注意:这不适用于 Airflow 2。TriggerDagRunOperator 在后续版本中发生了变化,不再提供 python_callable 参数,但您仍然可以指定 dag_run 的配置(conf)。
也许您需要查看 Airflow 的其他功能来实现您的目标。
带有 on_success_callback 的运算符将使您的 DAG 不那么混乱:您可以把 lambda: time.sleep(30) 放进这个回调里。对于 [t3, t4, t6] 这样的任务分组,可以使用 SubDagOperator 进行任务分组,并在 on_success_callback 中加入等待时间:
。def subdag(parent_dag_name, child_dag_name, args):
# Your SubDag definition here.
section_1 = SubDagOperator(
task_id='section-1',
subdag=subdag(DAG_NAME, 'section-1', args),
dag=dag,
on_success_callback=lambda: time.sleep(30)
)
答案 1 :(得分:0)
我确实使用了 for 循环生成任务名称并将其附加到列表中,一个接一个地运行几个任务 n、n+3、n+2 和 n+10 次 - 解决方案刚刚扩展,如在 Airflow rerun a single task multiple times on success
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2021, 4, 13),
'email': ['raff@abc.com', 'raffg@abc.com'],
'email_on_failure': True,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=5),
}
dag = DAG('sparktestingforstandalone',
schedule_interval='@yearly',
default_args=default_args,
catchup=False
)
sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '
start_op = DummyOperator(task_id='start_spark_runs',dag=dag)
t5 = SSHOperator(
ssh_hook=sshHook,
task_id='nonloop_usecase',
command=linux_command_5,
dag=dag)
chain_operators = []
chain_operators.append(start_op)
chain_operators_1 = []
chain_operators_1.append(start_op)
chain_operators_2 = []
chain_operators_2.append(start_op)
chain_operators_3 = []
chain_operators_3.append(start_op)
chain_operators_4 = []
chain_operators_4.append(start_op)
chain_operators_5 = []
chain_operators_5.append(start_op)
max_attempt = 10
for attempt in range(max_attempt):
data_pull = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_2_{}'.format(attempt),
command=linux_command_3,
dag=dag
)
data_pull_2 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_1_{}'.format(attempt),
command=linux_command_4,
dag=dag
)
data_pull_3 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_1{}'.format(attempt),
command=linux_command_6,
dag=dag
)
chain_operators.append(data_pull)
chain_operators_1.append(data_pull_2)
chain_operators_2.append(data_pull_3)
max_attempt_1 = 2
for attempt in range(max_attempt_1):
data_pull_4 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runtwice_task_2_{}'.format(attempt),
command=linux_command_1,
dag=dag
)
data_pull_5 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runtwice_task_1_{}'.format(attempt),
command=linux_command_2,
dag=dag
)
chain_operators_3.append(data_pull_4)
chain_operators_4.append(data_pull_5)
max_attempt_2 = 3
for attempt in range(max_attempt_2):
data_pull_6 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runthrice_{}'.format(attempt),
command=linux_command_7,
dag=dag
)
chain_operators_5.append(data_pull_6)
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
chain_operators_1.append(end_op)
chain_operators_2.append(end_op)
chain_operators_3.append(end_op)
chain_operators_4.append(end_op)
chain_operators_5.append(end_op)
chain_operators.append(end_op)
for i,val in enumerate(chain_operators[:-1]):
val.set_downstream(chain_operators[i+1])
for j,val in enumerate(chain_operators_1[:-1]):
val.set_downstream(chain_operators_1[j+1])
for k,val in enumerate(chain_operators_2[:-1]):
val.set_downstream(chain_operators_2[k+1])
start_op >> t5 >> end_op
start_op >> t7 >> end_op
for l,val in enumerate(chain_operators_3[:-1]):
val.set_downstream(chain_operators_3[l+1])
for m,val in enumerate(chain_operators_4[:-1]):
val.set_downstream(chain_operators_4[m+1])
for n,val in enumerate(chain_operators_5[:-1]):
val.set_downstream(chain_operators_5[n+1])