I am creating an AWS EMR cluster with the Airflow EMR operators; the cluster runs a JAR file stored in S3 and should write its output back to S3. The job appears to run fine using the JAR file from S3, but I cannot get it to write its output to S3. When I run the same job as an AWS EMR CLI Bash command, it writes to S3 without a problem, but I need to do this with the Airflow EMR operators. I have set the S3 output directory both in the Airflow step configuration and in the JAR file's environment configuration, yet I still cannot get the operators to write to it.
Here is the code for my Airflow DAG:
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
DEFAULT_ARGS = {
    'owner': 'AIRFLOW_USER',
    'depends_on_past': False,
    'start_date': datetime(2019, 9, 9),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False
}
RUN_STEPS = [
    {
        "Name": "run-custom-create-emr",
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit", "--deploy-mode", "cluster", "--master", "yarn", "--conf",
                "spark.yarn.submit.waitAppCompletion=false", "--class", "CLASSPATH",
                "s3://INPUT_JAR_FILE",
                "s3://OUTPUT_DIR"
            ]
        }
    }
]
JOB_FLOW_OVERRIDES = {
    "Name": "JOB_NAME",
    "LogUri": "s3://LOG_DIR/",
    "ReleaseLabel": "emr-5.23.0",
    "Instances": {
        "Ec2KeyName": "KP_USER_NAME",
        "Ec2SubnetId": "SUBNET",
        "EmrManagedMasterSecurityGroup": "SG-ID",
        "EmrManagedSlaveSecurityGroup": "SG-ID",
        "InstanceGroups": [
            {
                "Name": "Master nodes",
                "Market": "ON_DEMAND",
                "InstanceRole": "MASTER",
                "InstanceType": "m4.large",
                "InstanceCount": 1
            },
            {
                "Name": "Slave nodes",
                "Market": "ON_DEMAND",
                "InstanceRole": "CORE",
                "InstanceType": "m4.large",
                "InstanceCount": 1
            }
        ],
        "TerminationProtected": True,
        "KeepJobFlowAliveWhenNoSteps": True,
    },
    "Applications": [
        {"Name": "Spark"},
        {"Name": "Ganglia"},
        {"Name": "Hadoop"},
        {"Name": "Hive"}
    ],
    "JobFlowRole": "ROLE_NAME",
    "ServiceRole": "ROLE_NAME",
    "ScaleDownBehavior": "TERMINATE_AT_TASK_COMPLETION",
    "EbsRootVolumeSize": 10,
    "Tags": [
        {"Key": "Country", "Value": "us"},
        {"Key": "Environment", "Value": "dev"}
    ]
}
dag = DAG(
    'AWS-EMR-JOB',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=2),
    schedule_interval=None
)

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_connection_CustomCreate',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=RUN_STEPS,
    dag=dag
)

step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    dag=dag
)

cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag
)
cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)
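For reference, EmrAddStepsOperator submits RUN_STEPS through the same AddJobFlowSteps API call that the aws emr CLI uses, so the s3://OUTPUT_DIR path only reaches the job as a plain spark-submit argument. A minimal boto3 sketch of that call, with a placeholder cluster ID and an assumed region, looks like this:
import boto3

# Rough equivalent of what EmrAddStepsOperator does under the hood;
# the cluster ID and region below are placeholders, not real values.
emr_client = boto3.client("emr", region_name="us-east-1")
response = emr_client.add_job_flow_steps(
    JobFlowId="j-XXXXXXXXXXXX",
    Steps=RUN_STEPS
)
print(response["StepIds"])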
Does anyone have any ideas on how I can solve this problem? Any help would be greatly appreciated.
Answer 0 (score: 0)
I believe I have solved my problem. After really digging through all of the local Airflow logs and the EMR logs in S3, I found a Hadoop memory exception, so I increased the number of cores running the EMR job, and it now appears to work correctly.
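The answer does not show the exact change, but as an illustration, one way to give the step more capacity would be to enlarge the CORE instance group in JOB_FLOW_OVERRIDES; the instance type and count below are assumptions, not the answerer's actual values:
# Hypothetical resizing of the CORE instance group; adjust to your workload.
JOB_FLOW_OVERRIDES["Instances"]["InstanceGroups"][1] = {
    "Name": "Slave nodes",
    "Market": "ON_DEMAND",
    "InstanceRole": "CORE",
    "InstanceType": "m4.xlarge",  # larger instance type (assumption)
    "InstanceCount": 3            # more core nodes than the original 1 (assumption)
}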