Airflow - if a sensor fails, set downstream tasks to skip

Time: 2020-06-02 00:16:03

Tags: python airflow directed-acyclic-graphs

I need to check whether a file A is in the bucket; most of the time there is also a second version, B.

I want the DAG to start once A exists, and when B does not exist I want the DAG to end up marked successful rather than failed. I tried the branch operator, but what I need to branch on is the state of the upstream sensor task, not a Python callable returning True/False.

For now I am experimenting with the only option I could find in the documentation, the sensor's soft_fail flag.

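A minimal sketch of the pattern I think I need (the DAG id, bucket and task ids are made up; assuming the Airflow 1.10 contrib operators used below): with soft_fail=True the sensor ends up SKIPPED instead of FAILED when it times out, the default all_success trigger rule cascades the skip down the branch, and a none_failed join still runs when the other branch succeeded.

from datetime import datetime

from airflow import DAG
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor
from airflow.operators.dummy_operator import DummyOperator

with DAG("skip-pattern-sketch", start_date=datetime(2020, 5, 6),
         schedule_interval="@daily") as sketch:
    sensor_b = GoogleCloudStorageObjectSensor(
        task_id="sensor-B",
        bucket="some-bucket",
        object="some/object_{{ ds_nodash }}.csv",
        google_cloud_conn_id="gcp_conn",
        timeout=600,     # soft_fail only acts on timeout (default is 7 days)
        soft_fail=True,  # timeout -> task marked SKIPPED instead of FAILED
    )
    work_b = DummyOperator(task_id="work-B")  # default all_success: a skipped sensor skips this too
    join = DummyOperator(task_id="join", trigger_rule="none_failed")  # runs as long as nothing failed
    sensor_b >> work_b >> join

My full DAG is below: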

from datetime import datetime
from datetime import timedelta

import pandas as pd
from airflow import DAG
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    "owner": "me",
    "depends_on_past": False,
    "start_date": datetime(2020, 5, 6),  ## TODO : A définir.
    "end_date": datetime(2020, 5, 6),
    "sla": timedelta(minutes=2)
}


def get_db_args(area: str):
    db_config = Variable.get(f"db_{area}_dev", deserialize_json=True)
    db_user = db_config["db_user"]
    db_pwd = db_config["db_pwd"]
    db_ip = db_config["db_ip"]
    return f"postgresql+psycopg2://{db_user}:{db_pwd}@{db_ip}/geometrie-voie"


def transform_and_copy(file_extension, **context):
    # trackgeo_prepared = pd.read_parquet(f"gs://my-cloud-safe/geometrie/preprocessing/input/prepared/track_geometry_prepared_{date}.parquet")
    # trackgeo_prepared.to_csv(f"gs://my-cloud-expo/geometrie/preprocessing/input/prepared/track_geometry_prepared_{date}.csv")

    date = context['ds_nodash']

    trackgeo_monitored = pd.read_parquet(
        f"gs://my-cloud-safe-dev/geometrie/preprocessing/output/monitored/track_geometry_monitored_{date}_{file_extension}.parquet")
    trackgeo_monitored.to_csv(
        f"gs://my-cloud-expo-dev/geometrie/preprocessing/output/monitored/track_geometry_monitored_{date}_{file_extension}.csv")

    defaults_bruts = pd.read_parquet(
        f"gs://my-cloud-safe-dev/geometrie/preprocessing/output/defauts_bruts/defauts_bruts_{date}_{file_extension}.parquet")
    defaults_bruts.to_csv(
        f"gs://my-cloud-expo-dev/geometrie/preprocessing/output/defauts_bruts/defauts_bruts_{date}_{file_extension}.csv")



with DAG(
        "geometrie-preprocessing-dev",
        default_args=default_args,
        schedule_interval="@daily",
        catchup=True
) as dag:
    preprocessing_started = DummyOperator(
        task_id="preprocessing_started"
    )

    # File A

    sensor_gcs_A = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-A",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_A.csv",
        google_cloud_conn_id="gcp_conn",
        poke_interval=50,
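        # soft_fail marks the sensor SKIPPED instead of FAILED, but only once
        # it times out; the default timeout is 7 days, so an explicit timeout=
        # may be needed for the skip to happen within a reasonable window.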
        soft_fail=True,
    )

    k8s_trackgeo_cleaned_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-trackgeo-cleaned-A",
        name="trackgeo-cleaned-A",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$TRACKGEO_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", "A"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
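        # NB: "none_failed" fires even when an upstream task was SKIPPED, so a
        # skipped sensor will NOT skip this task; the default "all_success"
        # rule is what lets a skip cascade down the branch.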
        trigger_rule="none_failed"
    )

    k8s_nl_normalisation_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-nl-normalisation-A",
        name="nl-normalisation-A",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$NL_NORMALISATION_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "A"],
        env_vars={"ENV_DEV": "stagging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )

    k8s_calcul_default_A = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-calcul_default-A",
        name="calcul_default-A",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$CALCUL_DEFAULT_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "A"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )

    preprocessing_A_finished = DummyOperator(
        task_id="preprocessing_A_finished",
        trigger_rule="none_failed"
    )

    transform_and_copy_to_expo_A = PythonOperator(
        task_id="transform_and_copy_to_expo_A",
        provide_context=True,
        python_callable=transform_and_copy,
        op_kwargs={"file_extension": "A"},
        trigger_rule="none_failed"
    )

    # File B

    sensor_gcs_B = GoogleCloudStorageObjectSensor(
        task_id="gcs-sensor-B",
        bucket="my-cloud-safe-dev",
        object="geometrie/original/track_geometry_{{ ds_nodash }}_B.csv",
        google_cloud_conn_id="gcp_conn",
        poke_interval=50,
        soft_fail=True,
    )

    k8s_trackgeo_cleaned_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-trackgeo-cleaned-B",
        name="trackgeo-cleaned_B",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$TRACKGEO_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
        # resources={
        #     # 'request_cpu': '500m',
        #     'request_memory': '4Gi',
        #     # 'limit_cpu': '2000m',
        #     'limit_memory': '10Gi'
        # }
    )

    k8s_nl_normalisation_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-nl-normalisation-B",
        name="nl-normalisation-B",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$NL_NORMALISATION_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )

    k8s_calcul_default_B = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s-calcul_default-B",
        name="calcul_default-B",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$CALCUL_DEFAULT_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds_nodash }}", get_db_args("safe"), "B"],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="none_failed"
    )

    preprocessing_B_finished = DummyOperator(
        task_id="preprocessing_B_finished",
        trigger_rule="none_failed"
    )

    transform_and_copy_to_expo_B = PythonOperator(
        task_id="transform_and_copy_to_expo_B",
        provide_context=True,
        python_callable=transform_and_copy,
        op_kwargs={"file_extension": "B"},
        trigger_rule="none_failed"
    )

    k8s_copy_db_to_lab = KubernetesPodOperator(
        namespace="geometrie-voie-dev",  # Nom du fichier d'execution
        task_id="k8s_copy_db_to_lab",
        name="copy_db_to_lab",  # Nom du pod
        node_selectors={  # selection du node (mutualiser ou spécifique)
            "pool": "jobs"
        },
        image="$COPY_DB_TO_LAB_IMAGE_DEV:$CI_COMMIT_SHA",  # Image docker à déployer, var de gitlab-runner
        cmds=["python", "-u", "main.py"],
        arguments=["{{ ds }}", get_db_args("safe"), get_db_args("lab")],
        env_vars={"ENV_DEV": "staging"},
        get_logs=True,
        in_cluster=False,
        trigger_rule="one_success",
        wait_for_downstream="true"
    )

    # TODO: add a DAG for reading and verification checks
    preprocessing_started >> [sensor_gcs_A, sensor_gcs_B]

    sensor_gcs_A >> k8s_trackgeo_cleaned_A >> [k8s_nl_normalisation_A,
                                               k8s_calcul_default_A] >> preprocessing_A_finished >> transform_and_copy_to_expo_A
    sensor_gcs_B >> k8s_trackgeo_cleaned_B >> [k8s_nl_normalisation_B,
                                               k8s_calcul_default_B] >> preprocessing_B_finished >> transform_and_copy_to_expo_B
    [transform_and_copy_to_expo_A, transform_and_copy_to_expo_B] >> k8s_copy_db_to_lab
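
An alternative I am considering for the B branch (a sketch only; gcs_object_exists and check_gcs_B are names I made up, assuming the Airflow 1.10 contrib hook): replace the sensor with a ShortCircuitOperator that checks for the object itself and, on a falsy return value, marks every task downstream of it as skipped.

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.operators.python_operator import ShortCircuitOperator


def gcs_object_exists(bucket, object_template, **context):
    # A falsy return makes ShortCircuitOperator skip all downstream tasks.
    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id="gcp_conn")
    return hook.exists(bucket, object_template.format(ds_nodash=context["ds_nodash"]))


check_gcs_B = ShortCircuitOperator(
    task_id="check-gcs-B",
    provide_context=True,
    python_callable=gcs_object_exists,
    op_kwargs={
        "bucket": "my-cloud-safe-dev",
        "object_template": "geometrie/original/track_geometry_{ds_nodash}_B.csv",
    },
    dag=dag,
)

The catch is that ShortCircuitOperator skips everything downstream of it, including the final k8s_copy_db_to_lab join, so the join would need to be wired outside the short-circuited branch (or the soft_fail cascade above used instead).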

0 Answers:

No answers yet.