I am currently trying to trigger and schedule some PySpark jobs with Oozie. My cluster runs HDP 2.6 and is managed with Ambari.
I created my workflow with a SparkAction, but when I try to run it I get the following error:
Failing Oozie Launcher, Main class [org.apache.oozie.action.hadoop.SparkMain], main() threw exception, org/apache/spark/deploy/SparkSubmit
java.lang.NoClassDefFoundError: org/apache/spark/deploy/SparkSubmit
at org.apache.oozie.action.hadoop.SparkMain.run(SparkMain.java:222)
at org.apache.oozie.action.hadoop.LauncherMain.run(LauncherMain.java:58)
at org.apache.oozie.action.hadoop.SparkMain.main(SparkMain.java:62)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.oozie.action.hadoop.LauncherMapper.map(LauncherMapper.java:237)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:453)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:170)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1866)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:164)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.deploy.SparkSubmit
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 16 more
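As far as I can tell, the NoClassDefFoundError means the launcher cannot find the Spark classes at all, i.e. the Oozie Spark sharelib is not on its classpath. To see which sharelibs the server actually has, I believe the Oozie CLI can list them (the server URL here is a guess based on my host and the default Oozie port 11000):
oozie admin -oozie http://hadoop1.hadoopcwb:11000/oozie -shareliblist spark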
workflow.xml:
<workflow-app name="PySpark" xmlns="uri:oozie:workflow:0.5">
  <global>
    <configuration>
      <property>
        <name>oozie.launcher.yarn.app.mapreduce.am.env</name>
        <value>PYSPARK_ARCHIVES_PATH=pyspark.zip</value>
      </property>
    </configuration>
  </global>
  <start to="spark-846b"/>
  <kill name="Kill">
    <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
  </kill>
  <action name="spark-846b">
    <spark xmlns="uri:oozie:spark-action:0.1">
      <job-tracker>${jobTracker}</job-tracker>
      <name-node>${nameNode}</name-node>
      <master>${master}</master>
      <name>MyApp</name>
      <jar>${nameNode}/analysis/000001/oozie/dummy.py</jar>
      <spark-opts>--conf spark.driver.extraJavaOptions=-Dhdp.version=2.6.1.0-129 --conf spark.yarn.archive=hdfs://hadoop1.hadoopcwb:8020/hdp/apps/2.6.1.0-129/spark2/spark2-hdp-yarn-archive.tar.gz --py-files pyspark.zip,py4j-0.10.4-src.zip</spark-opts>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
  </action>
  <end name="End"/>
</workflow-app>
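The relative --py-files paths above assume that pyspark.zip and py4j-0.10.4-src.zip sit next to the workflow, e.g. in its lib/ directory. For completeness, this is roughly how I would put them there on HDP 2.6 (the /usr/hdp/current/spark2-client paths are an assumption about the client layout):
hdfs dfs -put /usr/hdp/current/spark2-client/python/lib/pyspark.zip /analysis/000001/oozie/lib/
hdfs dfs -put /usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip /analysis/000001/oozie/lib/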
job.properties:
nameNode=hdfs://hadoop1.hadoopcwb:8020
jobTracker=hadoop1.hadoopcwb:8032
master=yarn-cluster
queueName=default
oozie.use.system.libpath=true
oozie.wf.application.path=${nameNode}/analysis/000001/oozie
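One thing I am unsure about: since HDP 2.6 ships both Spark versions, the action may need to be pointed at the Spark 2 sharelib explicitly. If I read the Oozie docs correctly, that would be one more line here (the sharelib name spark2 is an assumption; it depends on how the sharelib was installed):
oozie.action.sharelib.for.spark=spark2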
The Python job (dummy.py):
from pyspark import SparkConf, SparkContext
from operator import add

def main():
    conf = SparkConf().setAppName("MyApp")
    sc = SparkContext(conf=conf)
    # Minimal sanity check so the job actually exercises the executors
    # (uses the otherwise-unused `add` import).
    print(sc.parallelize(range(10)).reduce(add))
    sc.stop()

if __name__ == '__main__':
    main()
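For comparison, outside Oozie I would submit the same script roughly like this (only a sketch; the flags mirror the master, deploy mode and --py-files used in the workflow above):
spark-submit --master yarn --deploy-mode cluster --py-files pyspark.zip,py4j-0.10.4-src.zip dummy.py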