I need to use Oozie to schedule a Spark job, but I am confused about how Oozie generates the date-specific file paths and how to read those paths in Scala. The sample configuration has INPUT and OUTPUT, but each name is reused in several places (as a dataset name, as a data-in/data-out event name, and as a workflow property), which confuses me. If I pass ${INPUT} to the shell script, I am not sure whether it is a file path or the actual data, because
<property>
    <name>INPUT</name>
    <value>${coord:dataIn('INPUT')}</value>
</property>
it looks to me as if this step loads the data itself into the value. Thanks.
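For context, this is the shape of the Scala entry point I have in mind — a minimal sketch with placeholder names (MyJob and the textFile/saveAsTextFile logic are stand-ins of mine), written on the assumption that ${INPUT} and ${OUTPUT} arrive as plain path strings:

import org.apache.spark.{SparkConf, SparkContext}

// Placeholder Spark 1.6 entry point; run.sh is expected to forward $1 and $2 here.
object MyJob {
  def main(args: Array[String]): Unit = {
    val inputPath  = args(0)  // assumed: the resolved HDFS path from ${INPUT}
    val outputPath = args(1)  // assumed: the resolved HDFS path from ${OUTPUT}

    val sc = new SparkContext(new SparkConf().setAppName("myjob"))
    // This only makes sense if args(0) really is a path/URI and not the data itself.
    sc.textFile(inputPath).saveAsTextFile(outputPath)
    sc.stop()
  }
}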
Here is my sample code:
coordinator.xml:
<coordinator-app name="myjob"
                 frequency="${coord:hours(1)}"
                 start="${start_time}"
                 end="${end_time}"
                 timezone="${timezone}"
                 xmlns="uri:oozie:coordinator:0.3">
    <controls>
        <timeout>21600</timeout>
        <concurrency>${concurrency}</concurrency>
    </controls>
    <datasets>
        <dataset name="INPUT" frequency="${coord:hours(1)}" initial-instance="${ds_start}" timezone="${timezone}">
            <uri-template>${INPUT_TEMPLATE}</uri-template>
        </dataset>
        <dataset name="OUTPUT" frequency="${coord:days(1)}" initial-instance="${ds_start}" timezone="${timezone}">
            <uri-template>${OUTPUT_TEMPLATE}</uri-template>
        </dataset>
    </datasets>
    <input-events>
        <data-in name="INPUT" dataset="INPUT">
            <instance>${coord:current(0)}</instance>
        </data-in>
    </input-events>
    <output-events>
        <data-out name="OUTPUT" dataset="OUTPUT">
            <instance>${coord:current(0)}</instance>
        </data-out>
    </output-events>
    <action>
        <workflow>
            <app-path>${app_dir}</app-path>
            <configuration>
                <property>
                    <name>INPUT</name>
                    <value>${coord:dataIn('INPUT')}</value>
                </property>
                <property>
                    <name>OUTPUT</name>
                    <value>${coord:dataOut('OUTPUT')}</value>
                </property>
            </configuration>
        </workflow>
    </action>
</coordinator-app>
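As far as I can tell from the coordinator spec, coord:dataIn('INPUT') resolves the input event against the INPUT dataset and returns the URI(s) of the matching instance(s) as one string (comma-separated if there are several instances), so for a nominal time of 2016-12-04T05:00Z I would expect something like

${coord:dataIn('INPUT')}  ->  /projects/xxx/20161204/05

i.e. a path string rather than file contents. Is that the correct reading?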
workflow.xml:
<action name="spark_shell" retry-max="0" retry-interval="0">
    <shell xmlns="uri:oozie:shell-action:0.1">
        <job-tracker>${job_tracker}</job-tracker>
        <name-node>${name_node}</name-node>
        <prepare>
            <delete path="${OUTPUT}" />
        </prepare>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queue_name}</value>
            </property>
            <property>
                <name>mapreduce.job.acl-view-job</name>
                <value>*</value>
            </property>
            <property>
                <name>oozie.launcher.mapreduce.map.memory.mb</name>
                <value>6144</value>
            </property>
            <property>
                <name>oozie.launcher.mapreduce.job.acl-view-job</name>
                <value>*</value>
            </property>
            <property>
                <name>mapreduce.job.acl-modify-job</name>
                <value>*</value>
            </property>
        </configuration>
        <exec>run.sh</exec>
        <argument>${INPUT}</argument>
        <argument>${OUTPUT}</argument>
        <env-var>HADOOP_USER_NAME=${USER}</env-var>
        <env-var>YARN_CONF_DIR=/usr/lib/spark/conf</env-var>
        <env-var>HADOOP_CONF_DIR=/etc/hadoop/conf</env-var>
        <file>lib/run.sh</file>
        <archive>/user/bac/spark-1.6.1-bin-hadoop2.6</archive>
    </shell>
    <ok to="end" />
    <error to="fail" />
</action>
<kill name="fail">
    <message>fail : ERROR MSG - [${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end" />
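For what it's worth, my understanding of the shell action is that each <argument> value reaches run.sh as a positional parameter, so whatever ${INPUT} and ${OUTPUT} resolve to arrives as the single strings $1 and $2, which run.sh can then forward to spark-submit as the application's last two arguments (matching args(0) and args(1) in the sketch above).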
coordinator.properties:
# Coordinator
oozie.coord.application.path=/user/xxx/xxxx
app_dir=/user/xxx/xxxx
oozie.use.system.libpath=true
oozie.user=xxxxx
# Workflow
job_tracker=mycluster
name_node=hdfs://mycluster
queue_name=xxxxx
thrift_uri=xxxx
oozie_server=xxxx
job_name=xxxx
# ACL
acl_view_job=*
acl_modify_job=*
# Input datasets
INPUT_TEMPLATE=/projects/xxx/${YEAR}${MONTH}${DAY}/${HOUR}
# Output datasets
OUTPUT_TEMPLATE=/projects/xxx/${YEAR}${MONTH}${DAY}/${HOUR}/output
start_time=2016-12-04T00:00Z
end_time=2017-12-05T00:00Z
ds_start=2013-01-01T00:00Z
concurrency=2
# Params
timezone=UTC
USER=xxx
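To make the question concrete, my assumption is that Oozie expands ${YEAR}${MONTH}${DAY}/${HOUR} from the timestamp of the resolved dataset instance. For the hourly INPUT dataset at 2016-12-04T05:00Z that would give

/projects/xxx/20161204/05

while for OUTPUT, which is a daily dataset (instances at 00:00 given ds_start), I would expect ${HOUR} to always resolve to 00:

/projects/xxx/20161204/00/output

Is that really what ends up in ${INPUT} and ${OUTPUT} by the time run.sh, and ultimately my Scala code, sees them?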