I am trying to build a dynamic workflow. I have this:
I tried to create tasks dynamically using a BashOperator (which calls a Python script).
My DAG:
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
scriptAirflow = '/home/alexw/scriptAirflow/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
fmsFiles=[]
memFiles=[]
def onlyCsvFiles():
if(os.listdir(uploadPath)):
for files in os.listdir(uploadPath):
if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
shutil.move(uploadPath+files, receiptPath)
print(files+' moved in ' + receiptPath+files)
for files in os.listdir(receiptPath):
if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
return "run_scripts"
else:
return "no_script"
else:
print('No file in upload_00')
default_args = {
'owner': 'manuf2020',
'start_date': dt.datetime(2020, 2, 17),
'retries': 1,
}
dag = DAG('lv-manuf2020', default_args=default_args, description='airflow_manuf2020',
schedule_interval=None, catchup=False)
file_sensor = FileSensor(
task_id="file_sensor",
filepath=uploadPath,
fs_conn_id='airflow_db',
poke_interval=10,
dag=dag,
)
move_csv = BranchPythonOperator(
task_id='move_csv',
python_callable=onlyCsvFiles,
trigger_rule='none_failed',
dag=dag,
)
run_scripts = DummyOperator(
task_id="run_scripts",
dag=dag
)
no_script= TriggerDagRunOperator(
task_id='no_script',
trigger_dag_id='lv-manuf2020',
trigger_rule='all_done',
dag=dag,
)
if os.listdir(receiptPath):
for files in os.listdir(receiptPath):
if files.startswith('FMS') and files.endswith('.csv'):
fmsFiles.append(files)
if files.startswith('MEM') and files.endswith('.csv'):
memFiles.append(files)
else:
pass
for files in fmsFiles:
run_Fms_Script = BashOperator(
task_id="fms_script_"+files,
bash_command='python3 '+scriptAirflow+'fmsScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
run_scripts.set_downstream(run_Fms_Script)
rerun_dag.set_upstream(run_Fms_Script)
for files in memFiles:
run_Mem_Script = BashOperator(
task_id="mem_script_"+files,
bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
run_scripts.set_downstream(run_Mem_Script)
rerun_dag.set_upstream(run_Mem_Script)
move_csv.set_upstream(file_sensor)
run_scripts.set_upstream(move_csv)
no_script.set_upstream(move_csv)
It does not work the way I want. In this loop, it calls a Python script that is supposed to launch a shell script. The tasks get created, but right after that the DAG reruns immediately, without ever launching my scripts.
for files in memFiles:
run_Mem_Script = BashOperator(
task_id="mem_script_"+files,
bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
run_scripts.set_downstream(run_Mem_Script)
rerun_dag.set_upstream(run_Mem_Script)
Can someone tell me how to create dynamic tasks in parallel with a BashOperator when needed (since that is how I call my Python scripts)? I need something like:
file_sensor >> move_csv >> run_scripts >> dynamic_task >> rerun_dag
Answer (score: 0):
When the DAG file is created, all of this code runs exactly once; only the onlyCsvFiles function runs periodically, as part of a task.
Airflow imports the Python file, which runs the interpreter and creates a .pyc file next to the DAG's original .py file. Since the code does not change afterwards, Airflow never runs the DAG's code again and keeps using the same .pyc file on every subsequent import.
The .pyc file is created by the Python interpreter when the .py file is imported.
https://www.tutorialspoint.com/What-are-pyc-files-in-Python
To add or change the DAG's tasks, you have to create a process that periodically runs the interpreter and updates the .pyc file.
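As a minimal illustration of that byte-compilation step (a sketch only; the file name mydag.py is a placeholder, not a file from the DAG above):
import py_compile
# Byte-compile the DAG file explicitly. A plain `python3 mydag.py`
# run has the same effect, as a side effect of loading the module.
pyc_path = py_compile.compile('mydag.py')
print('byte-compiled to ' + pyc_path)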
There are several ways to set up such a process; the best one is to use Airflow itself to do it.
I do not recommend the other methods of creating dynamic tasks. With this approach, you need to create another task that triggers re-interpretation of the Python file, to "refresh" the .pyc file with the potentially new tasks; those are created at runtime inside this loop:
for files in memFiles:
run_Mem_Script = BashOperator(
task_id="mem_script_"+files,
bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
The python command triggers the interpretation and updates the .pyc file.
Create the standalone task in the DAG as follows (edit the bash command with the absolute path of your DAG file):
interpret_python = BashOperator(
task_id="interpret_python",
bash_command='python3 /path/to/this/file.py',
dag=dag,
)
I do not recommend using a Python function to find the current file's path, because you may end up with Airflow's own run path instead, since Airflow is what imports your code; it might work, though.
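For reference, that discouraged variant would look roughly like this (a sketch only, not the recommendation; whether __file__ resolves to the DAG file depends on how Airflow imports the module):
import os
# Derive the DAG file's own path at parse time instead of
# hard-coding it. This is the approach advised against above,
# because __file__ may not point where you expect under Airflow.
interpret_python = BashOperator(
    task_id="interpret_python",
    bash_command='python3 ' + os.path.abspath(__file__),
    dag=dag,
)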
Your new code (I only added the interpret_python task to your code; remember to replace /path/to/this/file.py with the absolute path of your DAG file):
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
scriptAirflow = '/home/alexw/scriptAirflow/'
uploadPath='/apps/lv-manuf2020-data/80_DATA/00_Loading/'
receiptPath= '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
fmsFiles=[]
memFiles=[]
def onlyCsvFiles():
if(os.listdir(uploadPath)):
for files in os.listdir(uploadPath):
if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
shutil.move(uploadPath+files, receiptPath)
print(files+' moved in ' + receiptPath+files)
for files in os.listdir(receiptPath):
if(files.startswith('MEM') and files.endswith('.csv') or files.startswith('FMS') and files.endswith('.csv')):
return "run_scripts"
else:
return "no_script"
else:
print('No file in upload_00')
default_args = {
'owner': 'manuf2020',
'start_date': dt.datetime(2020, 2, 17),
'retries': 1,
}
dag = DAG('lv-manuf2020', default_args=default_args, description='airflow_manuf2020',
schedule_interval=None, catchup=False)
file_sensor = FileSensor(
task_id="file_sensor",
filepath=uploadPath,
fs_conn_id='airflow_db',
poke_interval=10,
dag=dag,
)
move_csv = BranchPythonOperator(
task_id='move_csv',
python_callable=onlyCsvFiles,
trigger_rule='none_failed',
dag=dag,
)
run_scripts = DummyOperator(
task_id="run_scripts",
dag=dag
)
no_script= TriggerDagRunOperator(
task_id='no_script',
trigger_dag_id='lv-manuf2020',
trigger_rule='all_done',
dag=dag,
)
interpret_python = BashOperator(
task_id="interpret_python",
bash_command='python3 /path/to/this/file.py',
dag=dag,
)
if os.listdir(receiptPath):
for files in os.listdir(receiptPath):
if files.startswith('FMS') and files.endswith('.csv'):
fmsFiles.append(files)
if files.startswith('MEM') and files.endswith('.csv'):
memFiles.append(files)
else:
pass
for files in fmsFiles:
run_Fms_Script = BashOperator(
task_id="fms_script_"+files,
bash_command='python3 '+scriptAirflow+'fmsScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
run_scripts.set_downstream(run_Fms_Script)
rerun_dag.set_upstream(run_Fms_Script)
for files in memFiles:
run_Mem_Script = BashOperator(
task_id="mem_script_"+files,
bash_command='python3 '+scriptAirflow+'memShScript.py "{{ execution_date }}"',
dag=dag,
)
rerun_dag=TriggerDagRunOperator(
task_id='rerun_dag',
trigger_dag_id='lv-manuf2020',
trigger_rule='none_failed',
dag=dag,
)
run_scripts.set_downstream(run_Mem_Script)
rerun_dag.set_upstream(run_Mem_Script)
move_csv.set_upstream(file_sensor)
run_scripts.set_upstream(move_csv)
no_script.set_upstream(move_csv)
If you hit runtime errors related to the interpret_python task, try to first cd into Airflow's base path (the directory containing airflow.cfg) and then call python3 with a relative path.
For example, if the Airflow path is /home/username/airflow and the DAG file is located at /home/username/airflow/dags/mydag.py, define interpret_python like this:
interpret_python = BashOperator(
task_id="interpret_python",
bash_command='cd /home/username/airflow && python3 dags/mydag.py',
dag=dag,
)