I have to check that a file A is in the bucket, and (most of the time) there is also a second version, B. I want the DAG to start when A exists; when B does not exist, I do not want the DAG run to be marked failed, but successful. I tried the branch operator, but what I need to branch on is the state of the previous task, not a Python callback returning True/False. For now I am playing with soft_fail, the only option I could find in the documentation. My full DAG is below, followed by a reduced sketch of the behaviour I am after.
from datetime import datetime
from datetime import timedelta
import pandas as pd
from airflow import DAG
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
default_args = {
    "owner": "me",
    "depends_on_past": False,
    "start_date": datetime(2020, 5, 6),  # TODO: to be defined.
    "end_date": datetime(2020, 5, 6),
    "sla": timedelta(minutes=2)
}
def get_db_args(area: str):
    # Build a psycopg2 connection URI from the "db_<area>_dev" Airflow Variable.
    db_config = Variable.get(f"db_{area}_dev", deserialize_json=True)
    db_user = db_config["db_user"]
    db_pwd = db_config["db_pwd"]
    db_ip = db_config["db_ip"]
    return f"postgresql+psycopg2://{db_user}:{db_pwd}@{db_ip}/geometrie-voie"
def transform_and_copy(file_extension, **context):
    # Convert the day's parquet outputs from the safe bucket to CSV in the expo bucket.
    # trackgeo_prepared = pd.read_parquet(f"gs://my-cloud-safe/geometrie/preprocessing/input/prepared/track_geometry_prepared_{date}.parquet")
    # trackgeo_prepared.to_csv(f"gs://my-cloud-expo/geometrie/preprocessing/input/prepared/track_geometry_prepared_{date}.csv")
    date = context['ds_nodash']
    trackgeo_monitored = pd.read_parquet(
        f"gs://my-cloud-safe-dev/geometrie/preprocessing/output/monitored/track_geometry_monitored_{date}_{file_extension}.parquet")
    trackgeo_monitored.to_csv(
        f"gs://my-cloud-expo-dev/geometrie/preprocessing/output/monitored/track_geometry_monitored_{date}_{file_extension}.csv")
    defaults_bruts = pd.read_parquet(
        f"gs://my-cloud-safe-dev/geometrie/preprocessing/output/defauts_bruts/defauts_bruts_{date}_{file_extension}.parquet")
    defaults_bruts.to_csv(
        f"gs://my-cloud-expo-dev/geometrie/preprocessing/output/defauts_bruts/defauts_bruts_{date}_{file_extension}.csv")
with DAG(
    "geometrie-preprocessing-dev",
    default_args=default_args,
    schedule_interval="@daily",
    catchup=True
) as dag:
    preprocessing_started = DummyOperator(
        task_id="preprocessing_started"
    )
    # File A
    sensor_gcs_A = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-A",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_A.csv",
        google_cloud_conn_id="gcp_conn",
        poke_interval=50,
        soft_fail=True,  # on timeout the sensor ends up SKIPPED instead of FAILED
    )
    k8s_trackgeo_cleaned_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-trackgeo-cleaned-A",
        name="trackgeo-cleaned-A",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$TRACKGEO_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", "A"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )
    k8s_nl_normalisation_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-nl-normalisation-A",
        name="nl-normalisation-A",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$NL_NORMALISATION_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "A"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )
    k8s_calcul_default_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-calcul-default-A",
        name="calcul-default-A",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$CALCUL_DEFAULT_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "A"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )
    preprocessing_A_finished = DummyOperator(
        task_id="preprocessing_A_finished",
        trigger_rule="none_failed"
    )
    transform_and_copy_to_expo_A = PythonOperator(
        task_id="transform_and_copy_to_expo_A",
        provide_context=True,
        python_callable=transform_and_copy,
        op_kwargs={"file_extension": "A"},
        trigger_rule="none_failed"
    )
    # File B
    sensor_gcs_B = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-B",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_B.csv",
        google_cloud_conn_id="gcp_conn",
        poke_interval=50,
        soft_fail=True,  # B is optional: skip this branch rather than fail the run
    )
    k8s_trackgeo_cleaned_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-trackgeo-cleaned-B",
        name="trackgeo-cleaned-B",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$TRACKGEO_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
        # resources={
        #     # 'request_cpu': '500m',
        #     'request_memory': '4Gi',
        #     # 'limit_cpu': '2000m',
        #     'limit_memory': '10Gi'
        # }
    )
    k8s_nl_normalisation_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-nl-normalisation-B",
        name="nl-normalisation-B",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$NL_NORMALISATION_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )
    k8s_calcul_default_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s-calcul-default-B",
        name="calcul-default-B",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$CALCUL_DEFAULT_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )
    preprocessing_B_finished = DummyOperator(
        task_id="preprocessing_B_finished",
        trigger_rule="none_failed"
    )
    transform_and_copy_to_expo_B = PythonOperator(
        task_id="transform_and_copy_to_expo_B",
        provide_context=True,
        python_callable=transform_and_copy,
        op_kwargs={"file_extension": "B"},
        trigger_rule="none_failed"
    )
    k8s_copy_db_to_lab = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Kubernetes namespace
        task_id="k8s_copy_db_to_lab",
        name="copy-db-to-lab",  # pod name
        node_selectors={  # node selection (shared or dedicated pool)
            "pool": "jobs"
        },
        image="$COPY_DB_TO_LAB_IMAGE_DEV:$CI_COMMIT_SHA",  # Docker image to deploy, GitLab runner variable
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds }}", get_db_args("safe"), get_db_args("lab")],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="one_success",  # run as soon as either branch's export has succeeded
        wait_for_downstream=True,  # wait for the previous run's downstream tasks to finish
    )
    # TODO: add a read-back and verification DAG.
    preprocessing_started >> [sensor_gcs_A, sensor_gcs_B]
    sensor_gcs_A >> k8s_trackgeo_cleaned_A >> [k8s_nl_normalisation_A, k8s_calcul_default_A] >> preprocessing_A_finished >> transform_and_copy_to_expo_A
    sensor_gcs_B >> k8s_trackgeo_cleaned_B >> [k8s_nl_normalisation_B, k8s_calcul_default_B] >> preprocessing_B_finished >> transform_and_copy_to_expo_B
    [transform_and_copy_to_expo_A, transform_and_copy_to_expo_B] >> k8s_copy_db_to_lab
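To make the intent concrete, here is a reduced sketch of the behaviour I am after. It reuses the contrib API and default_args from above; the sketch DAG, the dummy task names, and the combination of soft_fail on B with the default all_success rule on the B branch and one_success on the join are my own assumptions, which is exactly the part I would like confirmed:

from airflow import DAG
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor
from airflow.operators.dummy_operator import DummyOperator

with DAG("geometrie-preprocessing-sketch", default_args=default_args,
         schedule_interval="@daily") as sketch:
    # File A is mandatory: no soft_fail, so a timeout fails the run.
    sensor_a = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-A",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_A.csv",
        google_cloud_conn_id="gcp_conn",
        timeout=60 * 10,
    )
    # File B is optional: soft_fail marks the sensor SKIPPED on timeout.
    sensor_b = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-B",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_B.csv",
        google_cloud_conn_id="gcp_conn",
        timeout=60 * 10,
        soft_fail=True,
    )
    # Default trigger_rule="all_success": a SKIPPED sensor_b propagates
    # the skip down the whole B branch.
    process_a = DummyOperator(task_id="process-A")
    process_b = DummyOperator(task_id="process-B")
    # one_success: the join runs as soon as at least one branch succeeded,
    # so a skipped B branch still leaves the run green.
    join = DummyOperator(task_id="join", trigger_rule="one_success")

    sensor_a >> process_a >> join
    sensor_b >> process_b >> join

If that reasoning holds, a missing B means the B branch is skipped while the run still ends in success. Is this the idiomatic way to model an optional input file, or is there a way to branch directly on the state of the upstream sensor?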