我想自定义我的 DAG 以在成功或失败时调用 datarbicks 笔记本。我创建了两个不同的函数来根据成功/失败案例调用 databricks 笔记本。成功或失败回调函数正在调用,但 databricks 笔记本未执行。这是示例代码。
def task_success_callback(context):
""" task_success callback """
context['task_instance'].task_id
print("success case")
dq_notebook_success_task_params = {
'existing_cluster_id': Variable.get("DATABRICKS_CLUSTER_ID"),
'notebook_task': {
'notebook_path': '/AAA/Airflow/Operators/audit_file_operator',
'base_parameters': {
"root": "dbfs:/mnt/aaa",
"audit_file_path": "/success_file_path/",
"table_name": "sample_data_table",
"audit_flag": "success"
}
}
}
DatabricksSubmitRunOperator(
task_id="weather_table_task_id",
databricks_conn_id='databricks_conn',
json=dq_notebook_success_task_params,
do_xcom_push=True,
secrets=[secret.Secret(
deploy_type='env',
deploy_target=None,
secret='adf-service-principal'
), secret.Secret(
deploy_type='env',
deploy_target=None,
secret='postgres-credentials',
)],
)
def task_failure_callback(context):
""" task_success callback """
context['task_instance'].task_id
print("failure case")
dq_notebook_failure_task_params = {
'existing_cluster_id': Variable.get("DATABRICKS_CLUSTER_ID"),
'notebook_task': {
'notebook_path': '/AAA/Airflow/Operators/audit_file_operator',
'base_parameters': {
"root": "dbfs:/mnt/aaa",
"audit_file_path": "/failure_file_path/",
"table_name": "sample_data_table",
"audit_flag": "failure"
}
}
}
DatabricksSubmitRunOperator(
task_id="weather_table_task_id",
databricks_conn_id='databricks_conn',
json=dq_notebook_failure_task_params,
do_xcom_push=True,
secrets=[secret.Secret(
deploy_type='env',
deploy_target=None,
secret='adf-service-principal'
), secret.Secret(
deploy_type='env',
deploy_target=None,
secret='postgres-credentials',
)],
)
DEFAULT_ARGS = {
"owner": "admin",
"depends_on_past": False,
"start_date": datetime(2020, 9, 23),
"on_success_callback": task_success_callback,
"on_failure_callback": task_failure_callback,
"email": ["airflow@airflow.com"],
"email_on_failure": False,
"email_on_retry": False,
"retries": 1,
"retry_delay": timedelta(seconds=10),
}
==================
Remaining DAG code
==================
答案 0 :(得分:2)
在 Airflow 中,每个运算符都有定义运算符逻辑的 execute()
方法。当您创建工作流时,Airflow 会初始化构造函数,渲染模板并为您调用 execute 方法。但是,当您在 python 函数中定义运算符时,您还需要自己处理。
所以当你写:
def task_success_callback(context):
DatabricksSubmitRunOperator(..)
您在这里所做的只是初始化 DatabricksSubmitRunOperator
接触器。您没有调用运算符逻辑。
您需要做的是:
def task_success_callback(context):
op = DatabricksSubmitRunOperator(..)
op.execute()
答案 1 :(得分:0)
TableList = collections.namedtuple(
"table_list",
"table_name audit_file_name",
)
LIST_OF_TABLES = [
TableList(
table_name="table1",
audit_file_name="/testdata/Audit_files/",
),
TableList(
table_name="table2",
audit_file_name="/testdata/Audit_files/",
),
TableList(
table_name="table3",
audit_file_name="/testdata/Audit_files/",
),
TableList(
table_name="table4",
audit_file_name="/testdata/Audit_files/",
)
]
for table in LIST_OF_TABLES:
DEFAULT_ARGS = {
"owner": "admin",
"depends_on_past": False,
"start_date": datetime(2020, 9, 23),
"on_success_callback": partial(task_success_callback,table.table_name,table.audit_file_name),
"on_failure_callback": partial(task_failure_callback,table.table_name,table.audit_file_name),
"email": ["airflow@airflow.com"],
"email_on_failure": False,
"email_on_retry": False,
"retries": 1,
"retry_delay": timedelta(seconds=10),
}
WORKFLOW = DAG(
'test_dag',
default_args=DEFAULT_ARGS,
schedule_interval="30 3 * * 1",
catchup=False,
)