I am trying to run an Airflow DAG that instantiates a workflow template to create a Dataproc cluster. When I try to run the DAG, only the dag_id is shown in the UI and all the other fields stay blank. Can someone explain why this happens or where I am going wrong? Below is my template:
{
  "id": "newone",
  "placement": {
    "managedCluster": {
      "clusterName": "testinstant",
      "config": {
        "gceClusterConfig": {
          "zoneUri": "https://www.googleapis.com/compute/v1/projects/abhilash-thomas-sandbox/zones/us-central1-a",
          "serviceAccountScopes": [
            "https://www.googleapis.com/auth/cloud-platform"
          ]
        },
        "masterConfig": {
          "numInstances": 1,
          "machineTypeUri": "https://www.googleapis.com/compute/v1/projects/abhilash-thomas-sandbox/zones/us-central1-a/machineTypes/n1-standard-4",
          "diskConfig": {
            "bootDiskSizeGb": 100,
            "bootDiskType": "pd-standard"
          }
        },
        "workerConfig": {
          "numInstances": 2,
          "machineTypeUri": "https://www.googleapis.com/compute/v1/projects/abhilash-thomas-sandbox/zones/us-central1-a/machineTypes/n1-standard-4",
          "diskConfig": {
            "bootDiskSizeGb": 100,
            "bootDiskType": "pd-standard"
          }
        },
        "softwareConfig": {
          "properties": {
            "dataproc:dataproc.allow.zero.workers": "true"
          }
        },
        "initializationActions": [{
          "executableFile": "gs://loadtests/cloud-sql-proxy.sh",
          "executionTimeout": "600s"
        }]
      }
    }
  },
  "jobs": [{
    "hiveJob": {
      "queryFileUri": "gs://loadtests/testplugin/queryfile.sql"
    },
    "stepId": "job0"
  }]
}
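Before wiring the template into the DAG, it may be worth confirming that the file is valid JSON, since a parse failure at DAG-load time would break the whole file. A minimal check, assuming the template is saved locally as template.json (the local filename is just an example):

import json

# Sanity-check that the template parses as valid JSON before uploading it to GCS.
with open('template.json') as f:
    template = json.load(f)
print(template['placement']['managedCluster']['clusterName'])  # -> testinstant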
Here is my DAG file:
from airflow import DAG, models
from airflow.contrib.operators.dataproc_operator import DataprocWorkflowTemplateInstantiateInlineOperator
from datetime import datetime
import json
from google.cloud import storage
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 1),
    'email': None,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'project_id': models.Variable.get('gcp_project')
}
# Note: this GCS read runs at module level, i.e. every time the scheduler
# or webserver parses the DAG file.
client = storage.Client()
bucket = client.get_bucket('bucketname')
configblob = bucket.get_blob('template.json')
# Parse the template as JSON rather than eval()-ing downloaded text
template_dict = json.loads(configblob.download_as_string().decode())
print(template_dict)
# Create a basic DAG with our args
dag = DAG(
    dag_id='inline_instantiate',
    default_args=default_args,
    # Fire once; trigger manually for any further runs
    schedule_interval="@once",
)
startdataproc = DataprocWorkflowTemplateInstantiateInlineOperator(
    task_id='instantiate_inline',
    # Pass the parsed dict itself, not the string 'template_dict'
    template=template_dict,
    project_id='my-project',
    region='global',
    gcp_conn_id='google_cloud_default',
    dag=dag
)
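For what it's worth, a DAG that shows only its dag_id in the UI is often one the webserver failed to import, and here the GCS download and parse happen at module import time, so any failure there (credentials, bucket name, JSON parsing) breaks the whole file. A quick way to reproduce what the scheduler does is to import the file by hand; a minimal sketch, assuming the DAG file is saved as dataproc_dag.py (a hypothetical name):

import importlib.util

# Load the DAG file as a plain Python module; any import-time error
# (GCS access, JSON parsing, etc.) surfaces as a full traceback.
spec = importlib.util.spec_from_file_location('dataproc_dag', 'dataproc_dag.py')
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
print(module.dag.dag_id)  # -> inline_instantiate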