我是 Airflow 的新手,想在循环中运行一批任务,但是我遇到了循环(cycle)错误。
import time

from datetime import timedelta
from datetime import datetime

from airflow import DAG
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2021, 4, 13),
'email': ['raff@abc.com', 'raffg@abc.com'],
'email_on_failure': True,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=5),
}
dag = DAG('sparktestingforstandalone',
schedule_interval='@yearly',
default_args=default_args,
catchup=False
)
sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '
start_op = DummyOperator(task_id='start_spark_runs',dag=dag)
t1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase',
command=linux_command_1,
dag=dag)
t1_1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_1',
command=linux_command_1,
dag=dag)
t2 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_3',
command=linux_command_2,
dag=dag)
t2_1 = SSHOperator(
ssh_hook=sshHook,
task_id='File_Extract_useCase_12',
command=linux_command_2,
dag=dag)
t3 = SSHOperator(
ssh_hook=sshHook,
task_id='Join_useCase',
command=linux_command_3,
dag=dag)
t3_1 = SSHOperator(
ssh_hook=sshHook,
task_id='Join_useCase_1',
command=linux_command_3,
dag=dag)
t4 = SSHOperator(
ssh_hook=sshHook,
task_id='Denoramlize_usecase',
command=linux_command_5,
dag=dag)
t5 = SSHOperator(
ssh_hook=sshHook,
task_id='1798_useCase',
command=linux_command_5,
dag=dag)
t6 = SSHOperator(
ssh_hook=sshHook,
task_id='Json_Complex_Creation',
command=linux_command_6,
dag=dag)
t7 = SSHOperator(
ssh_hook=sshHook,
task_id='DB_to_DB_Creation',
command=linux_command_7,
dag=dag)
s1 = PythonOperator(task_id="delay_sleep_task_30sec",
dag=dag,
python_callable=lambda: time.sleep(30))
s1 = PythonOperator(task_id="delay_sleep_task_30sec_1",
dag=dag,
python_callable=lambda: time.sleep(30))
s2 = PythonOperator(task_id="delay_sleep_task_30sec_2",
dag=dag,
python_callable=lambda: time.sleep(30))
s3 = PythonOperator(task_id="delay_sleep_task_30sec_3",
dag=dag,
python_callable=lambda: time.sleep(30))
s4 = PythonOperator(task_id="delay_sleep_task_30sec_4",
dag=dag,
python_callable=lambda: time.sleep(30))
s5 = PythonOperator(task_id="delay_sleep_task_30sec_5",
dag=dag,
python_callable=lambda: time.sleep(30))
s6 = PythonOperator(task_id="delay_sleep_task_30sec_6",
dag=dag,
python_callable=lambda: time.sleep(30))
s7 = PythonOperator(task_id="delay_sleep_task_30sec_7",
dag=dag,
python_callable=lambda: time.sleep(30))
s8 = PythonOperator(task_id="delay_sleep_task_30sec_8",
dag=dag,
python_callable=lambda: time.sleep(30))
s9 = PythonOperator(task_id="delay_sleep_task_30sec_9",
dag=dag,
python_callable=lambda: time.sleep(30))
s10 = PythonOperator(task_id="delay_sleep_task_30sec_10",
dag=dag,
python_callable=lambda: time.sleep(30))
s11 = PythonOperator(task_id="delay_sleep_task_30sec_11",
dag=dag,
python_callable=lambda: time.sleep(30))
s12 = PythonOperator(task_id="delay_sleep_task_30sec_12",
dag=dag,
python_callable=lambda: time.sleep(30))
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
start_op >> t1 >> t1_1 >> end_op
start_op >> t2 >> t2_1 >> end_op
start_op >> t5 >> end_op
start_op >> t7 >> s1 >> t7 >> s2 >> t7 >> s3 >> end_op
start_op >> [t3,t4,t6] >> s4 >> [t3,t4,t6] >> s5 >> [t3,t4,t6] >> s6 >> [t3,t4,t6] >> s7 >> [t3,t4,t6] >> s8 >> [t3,t4,t6] >> s9 >> [t3,t4,t6] >> s10 >> [t3,t4,t6] >> s11 >> [t3,t4,t6] >> s12 >> end_op
我知道这很麻烦,有没有一种优雅的方法来实现它。
我想在循环中并行运行 t3、t4、t6 任务 n 次,并在每次运行之间休眠 30 秒。 还有多个其他任务,如 t7 也将被触发。
我想一次触发几个任务,在一个 dag 中多次触发几个任务,我不想像我在这里所做的那样创建那么多的实例,需要像提到的那样以优雅的方式来做。
答案 0 :(得分:0)
您不能在 Airflow 的 DAG 中创建循环:根据定义,DAG 是一个有向无环图(Directed Acyclic Graph)。
但是您可以使用 TriggerDagRunOperator。这将触发您定义的 DAG 的 DagRun。
def dag_run_payload(context, dag_run_obj):
# You can add the data of dag_run.conf in here
# use your context information and add it to the
# dag_run_obj.payload
dag_run_obj.payload = {}
trigger_next_iter = TriggerDagRunOperator(
dag=dag,
task_id='trigger_next_iter',
trigger_dag_id='sparktestingforstandalone', # Or any other DAG
execution_date="{{ ti.xcom_pull(...) }}", # Its templated
python_executable=dag_run_payload
)
end_op >> trigger_next_iter
您可以在 DAG 末尾附加触发器。
注意:这不适用于 Airflow 2。TriggerDagRunOperator 在后续版本中发生了变化,不再提供 python_callable 参数,但您仍然可以指定 dag_run 的配置(conf)。
也许您需要查看 Airflow 的其他功能来实现您的目标。
带有 on_success_callback 的运算符将使您的 DAG 不那么混乱:您可以把 lambda: time.sleep(30) 放进这个回调里。对于 [t3, t4, t6] 这样的任务分组,可以使用 SubDagOperator 进行任务分组,并在 on_success_callback 中加入等待时间:
。def subdag(parent_dag_name, child_dag_name, args):
# Your SubDag definition here.
section_1 = SubDagOperator(
task_id='section-1',
subdag=subdag(DAG_NAME, 'section-1', args),
dag=dag,
on_success_callback=lambda: time.sleep(30)
)
答案 1 :(得分:0)
我确实使用了 for 循环生成任务名称并将其附加到列表中,一个接一个地运行几个任务 n、n+3、n+2 和 n+10 次 - 解决方案刚刚扩展,如在 Airflow rerun a single task multiple times on success
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime(2021, 4, 13),
'email': ['raff@abc.com', 'raffg@abc.com'],
'email_on_failure': True,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=5),
}
dag = DAG('sparktestingforstandalone',
schedule_interval='@yearly',
default_args=default_args,
catchup=False
)
sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '
start_op = DummyOperator(task_id='start_spark_runs',dag=dag)
t5 = SSHOperator(
ssh_hook=sshHook,
task_id='nonloop_usecase',
command=linux_command_5,
dag=dag)
chain_operators = []
chain_operators.append(start_op)
chain_operators_1 = []
chain_operators_1.append(start_op)
chain_operators_2 = []
chain_operators_2.append(start_op)
chain_operators_3 = []
chain_operators_3.append(start_op)
chain_operators_4 = []
chain_operators_4.append(start_op)
chain_operators_5 = []
chain_operators_5.append(start_op)
max_attempt = 10
for attempt in range(max_attempt):
data_pull = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_2_{}'.format(attempt),
command=linux_command_3,
dag=dag
)
data_pull_2 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_1_{}'.format(attempt),
command=linux_command_4,
dag=dag
)
data_pull_3 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_run10_task_1{}'.format(attempt),
command=linux_command_6,
dag=dag
)
chain_operators.append(data_pull)
chain_operators_1.append(data_pull_2)
chain_operators_2.append(data_pull_3)
max_attempt_1 = 2
for attempt in range(max_attempt_1):
data_pull_4 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runtwice_task_2_{}'.format(attempt),
command=linux_command_1,
dag=dag
)
data_pull_5 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runtwice_task_1_{}'.format(attempt),
command=linux_command_2,
dag=dag
)
chain_operators_3.append(data_pull_4)
chain_operators_4.append(data_pull_5)
max_attempt_2 = 3
for attempt in range(max_attempt_2):
data_pull_6 = SSHOperator(
ssh_hook=sshHook,
task_id='Usecase_runthrice_{}'.format(attempt),
command=linux_command_7,
dag=dag
)
chain_operators_5.append(data_pull_6)
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
chain_operators_1.append(end_op)
chain_operators_2.append(end_op)
chain_operators_3.append(end_op)
chain_operators_4.append(end_op)
chain_operators_5.append(end_op)
chain_operators.append(end_op)
for i,val in enumerate(chain_operators[:-1]):
val.set_downstream(chain_operators[i+1])
for j,val in enumerate(chain_operators_1[:-1]):
val.set_downstream(chain_operators_1[j+1])
for k,val in enumerate(chain_operators_2[:-1]):
val.set_downstream(chain_operators_2[k+1])
start_op >> t5 >> end_op
start_op >> t7 >> end_op
for l,val in enumerate(chain_operators_3[:-1]):
val.set_downstream(chain_operators_3[l+1])
for m,val in enumerate(chain_operators_4[:-1]):
val.set_downstream(chain_operators_4[m+1])
for n,val in enumerate(chain_operators_5[:-1]):
val.set_downstream(chain_operators_5[n+1])