Question

我是Python和Airflow的新手，正在使用GCP Composer环境创建DAG。
在这段Python代码中，我创建了两个任务：一个任务读取zip或csv文件，另一个任务创建Dataproc集群。在第一个任务中，我调用了readYML方法，它从yml配置文件中读取Dataproc集群的参数（例如cluster_name、project_id等）；在第二个任务中，我需要使用同样的参数。请参见以下代码，以便更好地理解：

# Importing Modules

from airflow import DAG

from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from zipfile import ZipFile

from airflow.models import Variable
import yaml
from google.cloud import storage
from airflow.contrib.operators import dataproc_operator
import pandas as pd


# NOTE: a `global` statement at module level has no effect. It neither creates
# `cfg` nor makes assignments inside functions target the module namespace.
global cfg

def readYML():
    """Read the cluster configuration YAML and print its project id.

    The parsed mapping is bound to a *local* name ``cfg``: there is no
    ``global cfg`` statement inside this function, so the value is discarded
    when the function returns, and nothing is returned to the caller.
    Module-level code that later reads ``cfg`` therefore fails with
    ``NameError: name 'cfg' is not defined``.
    """
    print("inside readzip")
    file_name = "/home/airflow/gcs/data/cluster_config.yml"
    with open(file_name, 'r') as ymlfile:
        # Local binding only; the module namespace is not touched here.
        cfg = yaml.load(ymlfile)
    print("inside readYML method : ", cfg['configs']['project_id'])


def iterate_bucket():
    """List the blobs of a hard-coded GCS bucket into the global ``blobs``.

    A storage client is built from the service-account key file at a fixed
    path, the bucket is looked up by name, and the result of
    ``bucket.list_blobs()`` is stored in the module-level name ``blobs``.
    Nothing is returned.
    """
    # Declared global so that print_PcsvData can read the listing afterwards.
    global blobs
    bucket_name = 'europe-west1-airflow-test-9bbb5fc7-bucket'
    storage_client = storage.Client.from_service_account_json(
        '/home/airflow/gcs/data/service_account_key_gcp_compute_bmg.json')
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs()


def print_PcsvData():
    """Locate the ``physical.zip`` blob, list its members and load one CSV.

    Steps, in order:
      1. ``iterate_bucket()`` fills the global ``blobs``.
      2. ``readYML()`` is called; its result is not used here.
      3. Every blob whose name contains ``"physical.zip"`` is printed and its
         path under ``/home/airflow/gcs/`` is remembered (the last match wins).
      4. The archive is opened only to print its member names; the last
         member name is kept.
      5. A CSV with that member name is read from ``/home/airflow/gcs/data/``
         into the global ``readPcsv`` and printed.

    NOTE(review): ``file_name`` / ``readfilename`` are never assigned when no
    blob matches or the archive is empty, which would raise
    ``UnboundLocalError`` further down.
    """
    iterate_bucket()
    readYML()
    global readPcsv

    for blob in blobs:
        if "physical.zip" in blob.name:
            print("hello : ", blob.name)
            file_name = "/home/airflow/gcs/" + blob.name

    # NOTE: the local name `zip` shadows the builtin inside this function.
    with ZipFile(file_name, 'r') as zip:
        # printing all the contents of the zip file
        for info in zip.infolist():
            readfilename = info.filename
            print(readfilename)

    # The CSV is read from the data/ directory, not from inside the archive.
    # Presumably it has already been extracted there; confirm.
    readPcsv = pd.read_csv("/home/airflow/gcs/data/" + readfilename)

    print("physi cal.csv : ", readPcsv)
    print('Done!')


# Module-level DAG definition: everything below runs when the file is
# imported/parsed, i.e. before any task (and therefore readYML) has executed.

# Looked up from Airflow Variables; not referenced again in the visible code.
dag_name = Variable.get("dag_name")

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    # NOTE(review): datetime.now() yields a different start_date on every parse.
    'start_date': datetime.now(),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # `cfg` has never been bound at module level at this point, so this lookup
    # raises NameError ("name 'cfg' is not defined").
    'cluster_name': cfg['configs']['cluster_name'],   
 }

  # Instantiate a DAG

# NOTE(review): the statement below is indented at module level, which Python
# rejects with "IndentationError: unexpected indent". Looks like a paste
# artifact; confirm against the original file.
   dag = DAG(dag_id='read_yml', default_args=default_args, 
   schedule_interval=timedelta(days=1))

 # Creating Tasks   

# Task 1: runs print_PcsvData (bucket listing + CSV load) as a Python callable.
t1 = PythonOperator(task_id='Raw1', python_callable=print_PcsvData, 
dag=dag)

# Task 2: creates the Dataproc cluster. Every argument is read from `cfg` at
# parse time, so it hits the same undefined-name problem as default_args.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id=cfg['configs']['project_id'],
    cluster_name=cfg['configs']['cluster_name'],
    num_workers=cfg['configs']['num_workers'],
    zone=cfg['configs']['zone'],
    master_machine_type=cfg['configs']['master_machine_type'],
    worker_machine_type=cfg['configs']['worker_machine_type'],
    dag=dag)

# The cluster is created only after the read task has finished.
t1 >> create_dataproc_cluster

在这段代码中，我想把cfg作为全局变量使用，并且在default_args中也要访问它，但运行时报错。我不清楚这是否与变量作用域有关；我甚至已经在readYML方法中声明了cfg变量，但错误依然存在。任何帮助都将不胜感激，先谢谢了。

Answer 1

请查看下面这份应当使用的DAG文件：

您应该进行的一些更改：

不要使用全局变量，改为让函数返回值
永远不要把datetime.now()用作start_date，参见 https://airflow.apache.org/faq.html#what-s-the-deal-with-start-date

更新的文件：

# Importing Modules

from datetime import datetime, timedelta
from zipfile import ZipFile

import pandas as pd
import yaml
from airflow import DAG
from airflow.contrib.operators import dataproc_operator
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from google.cloud import storage



def readYML(file_name="/home/airflow/gcs/data/cluster_config.yml"):
    """Load the Dataproc cluster configuration from a YAML file.

    Args:
        file_name: Path of the YAML file to read. Defaults to the cluster
            configuration file in the environment's ``data`` folder.

    Returns:
        The parsed YAML document. The code below indexes it as
        ``cfg['configs'][<key>]``, so it is expected to be a mapping with a
        top-level ``configs`` entry.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        KeyError: If ``configs`` / ``project_id`` is missing (raised by the
            diagnostic print below).
    """
    print("inside readYML")
    with open(file_name, 'r') as ymlfile:
        # safe_load only builds plain Python objects. A bare yaml.load()
        # without a Loader is deprecated and rejected by recent PyYAML.
        cfg = yaml.safe_load(ymlfile)
    print("inside readYML method : ", cfg['configs']['project_id'])
    return cfg


def iterate_bucket():
    """Return the blob listing of the hard-coded GCS bucket.

    The storage client authenticates with the service-account key file
    stored at a fixed path; the value returned is whatever
    ``bucket.list_blobs()`` yields for that bucket.
    """
    key_path = '/home/airflow/gcs/data/service_account_key_gcp_compute_bmg.json'
    client = storage.Client.from_service_account_json(key_path)
    target_bucket = client.get_bucket('europe-west1-airflow-test-9bbb5fc7-bucket')
    return target_bucket.list_blobs()


def print_PcsvData():
    """Find the ``physical.zip`` blob, list its members and load one CSV.

    The bucket listing comes from ``iterate_bucket()``. Every blob whose name
    contains ``"physical.zip"`` is printed; the last match determines the
    archive path under ``/home/airflow/gcs/``. The archive is opened only to
    print its member names. The CSV named after the last member is then read
    from ``/home/airflow/gcs/data/`` and returned.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If no blob name contains ``"physical.zip"``.
        ValueError: If the archive has no members.
    """
    blobs = iterate_bucket()

    file_name = None
    for blob in blobs:
        if "physical.zip" in blob.name:
            print("hello : ", blob.name)
            file_name = "/home/airflow/gcs/" + blob.name
    if file_name is None:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise FileNotFoundError(
            'no blob whose name contains "physical.zip" was found in the bucket')

    readfilename = None
    # `archive` rather than `zip`, so the builtin is not shadowed.
    with ZipFile(file_name, 'r') as archive:
        # printing all the contents of the zip file
        for info in archive.infolist():
            readfilename = info.filename
            print(readfilename)
    if readfilename is None:
        raise ValueError("archive %s contains no members" % file_name)

    # The CSV is read from the data/ directory, not from inside the archive.
    readPcsv = pd.read_csv("/home/airflow/gcs/data/" + readfilename)

    print("physi cal.csv : ", readPcsv)
    print('Done!')
    return readPcsv

# Everything below runs at module import (DAG parse) time.

# Looked up from Airflow Variables; not referenced again in the visible code.
dag_name = Variable.get("dag_name")

# Load the cluster settings up front so that both default_args and the
# Dataproc operator can read them while the DAG is being built.
cfg = readYML()

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    # Call the imported helper directly: the bare name `airflow` is never
    # bound by the `from airflow... import ...` statements, so the original
    # `airflow.utils.dates.days_ago(2)` raised NameError.
    'start_date': days_ago(2),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'cluster_name': cfg['configs']['cluster_name'],
}

# Instantiate a DAG

dag = DAG(
    dag_id='read_yml',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
)

# Creating Tasks

# Task 1: runs print_PcsvData (bucket listing + CSV load) as a Python callable.
t1 = PythonOperator(
    task_id='Raw1',
    python_callable=print_PcsvData,
    dag=dag,
)

# Task 2: creates the Dataproc cluster described by the YAML configuration.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id=cfg['configs']['project_id'],
    cluster_name=cfg['configs']['cluster_name'],
    num_workers=cfg['configs']['num_workers'],
    zone=cfg['configs']['zone'],
    master_machine_type=cfg['configs']['master_machine_type'],
    worker_machine_type=cfg['configs']['worker_machine_type'],
    dag=dag,
)

# The cluster is created only after the read task has finished.
t1 >> create_dataproc_cluster

报错信息：Broken DAG（损坏的DAG）：[/home/airflow/gcs/dags/airflow_test_task.py] name 'cfg' is not defined（名称'cfg'未定义）

1 个答案: