Apache Airflow如何循环处理多个文件

时间:2019-06-28 19:17:51

标签: airflow

嗨,我正在尝试使用Apache Airflow处理多个文件。我尝试过其他方案,但最终使用了TriggerDagRunOperator。基本上我有2个DAG:一个按计划运行的DAG负责检查文件,如果找到文件,它就会触发另一个DAG(trigger_dag)。但我想对多个文件重复此操作:一次检查一个文件,如果文件存在,则添加参数并用这些参数调用被触发的DAG。


def conditionally_trigger(context, dag_run_obj):
    """Decide whether the target DAG run should be triggered.

    The id of the task that published the ``file_type`` XCom value is read
    from ``context['params']['task_id']``; the value itself is pulled through
    ``context['task_instance']``.

    Args:
        context: Mapping that provides ``'params'`` (with a ``'task_id'``
            entry) and ``'task_instance'``.
        dag_run_obj: Object that receives the ``payload`` attribute.

    Returns:
        ``dag_run_obj`` with ``payload`` set when a non-empty file type was
        pulled, otherwise ``None``.
    """
    upstream_task_id = context['params']['task_id']
    ti = context['task_instance']
    file_type = ti.xcom_pull(upstream_task_id, key='file_type')

    # Nothing was published (or an empty value was): do not trigger.
    if file_type is None or file_type == "":
        return None

    # NOTE(review): file_name and full_path are not assigned anywhere in this
    # function, so they have to resolve at module level; otherwise this line
    # raises NameError -- confirm where they are meant to come from.
    dag_run_obj.payload = {
        'file_type': file_type,
        'file_name': file_name,
        'file_path': full_path,
    }
    return dag_run_obj

# Task that starts a run of the DAG with id "trigger_dag". The decision and the
# payload are delegated to conditionally_trigger.
trigger_dag_run_task = TriggerDagRunOperator(
    dag=dag,
    task_id='trigger_dag_run_task',
    trigger_dag_id="trigger_dag",
    # conditionally_trigger reads params['task_id'] to know which task's XCom
    # value to pull.
    # NOTE(review): check_if_file_exists_task_id is not defined in the visible
    # code -- presumably the task_id of the file-check task; confirm.
    params={'task_id': check_if_file_exists_task_id},
    python_callable=conditionally_trigger,
)

def execute_check_if_file_exists_task(*args, **kwargs):
    """Pick the branch to follow depending on whether an input file exists.

    For every configured file type, the input folder is looked up in
    ``json_data`` and its directory entries are matched against the
    ``file_name`` pattern with ``re.match``. On the first match the file type
    is pushed to XCom under the key ``'file_type'`` and the trigger branch is
    selected.

    Args:
        *args: Unused positional arguments.
        **kwargs: Must contain ``'ti'``, the object whose ``xcom_push`` is
            called when a file is found.

    Returns:
        ``"trigger_dag_run_task"`` as soon as a matching file is found in any
        of the folders, ``"file_not_found_task"`` when none of them contains
        a match.

    Raises:
        OSError: Propagated from ``os.listdir`` when a folder cannot be read.
    """
    input_file_list = ["a", "b"]

    for item in input_file_list:
        # json_data and file_name are not assigned in this function; they are
        # expected to be available at module level.
        full_path = json_data[item]['input_folder_path']
        for entry in os.listdir(full_path):
            if re.match(file_name, entry):
                kwargs['ti'].xcom_push(key='file_type', value=item)
                return "trigger_dag_run_task"

    # Give up only after every folder has been scanned. This return used to
    # sit inside the outer loop, which ended the search after the first
    # folder and left the remaining file types unchecked.
    return "file_not_found_task"

def execute_file_not_found_task(*args, **kwargs):
    """Log that no input file was found.

    Args:
        *args: Unused positional arguments.
        **kwargs: Unused keyword arguments.

    Returns:
        None.
    """
    message = "File Not found path."
    logging.info(message)

# Task for the "no file" branch: runs execute_file_not_found_task, which only
# writes a log line.
file_not_found_task = PythonOperator(
    dag=dag,
    task_id='file_not_found_task',
    python_callable=execute_file_not_found_task,
    op_args=[],
    provide_context=True,
    retries=3,
)

# Branching task: execute_check_if_file_exists_task returns either
# "trigger_dag_run_task" or "file_not_found_task", the task_ids of the two
# tasks set downstream of this one.
check_if_file_exists_task = BranchPythonOperator(
    dag=dag,
    task_id='check_if_file_exists_task',
    python_callable=execute_check_if_file_exists_task,
    op_args=[],
    # The callable reads kwargs['ti'], so it needs the context passed in.
    provide_context=True,
    retries=3,
)

# Both possible branches hang directly off the file check.
check_if_file_exists_task.set_downstream(
    [trigger_dag_run_task, file_not_found_task]
)

0 个答案:

没有答案