I have this Scala code and I want to run it from the terminal with spark-submit. Running it inside the IntelliJ IDE works fine.
Code:
package com.scryAnalytics.NLPAnnotationController.Work

import java.net.MalformedURLException
import java.util.{ArrayList, Arrays}

import com.scryAnalytics.NLPAnnotationController.Configuration.{VOCPConstants, VocpConfiguration}
import com.scryAnalytics.NLPAnnotationController.DAO.NLPEntitiesDAO
import com.scryAnalytics.NLPGeneric.{NLPEntities, _}
import com.vocp.ner.main.GateNERImpl
import gate.util.GateException
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HTableDescriptor, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}

class NLPProcessingLog {
  var log: Logger = Logger.getLogger(classOf[NLPProcessingLog])
  log.info("Logger Initialized .....")
}

object NlpProcessing {
  val logger = new NLPProcessingLog

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(conf: org.apache.hadoop.conf.Configuration, batchString: String): Int = {
    logger.log.info("In Main Object..")

    // Initializing Spark context
    val sc = new SparkContext(new SparkConf().setAppName("NLPAnnotationController").setMaster("local"))

    // "newbatch" means: generate a fresh batch id from the current time
    val batchId =
      if (batchString == "newbatch") java.lang.Long.toString(System.currentTimeMillis())
      else batchString
    conf.set("batchId", batchId)

    // Input column families (not used further below)
    val inputCfs = Arrays.asList(conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(","): _*)

    try {
      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))

      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      // Create the output table if it does not exist yet
      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }

      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      // Keep only rows that have not been processed yet (flag column "f:is_processed")
      val processedFilteredRDD = hBaseRDD.map(x => x._2).filter { result =>
        val flag = Bytes.toString(result.getValue(Bytes.toBytes("f"), Bytes.toBytes("is_processed")))
        (flag == null) || (flag == "0") // compare with the string "0"; a String never equals the Int 0
      }
      println(processedFilteredRDD.count())

      // Extract (rowKey, message) pairs from the "p:message" column
      val messageRDD = processedFilteredRDD.filter { x => x != null }.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }
      println("Number of partitions " + messageRDD.getNumPartitions)

      val pluginHome = conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE)
      val requiredNLPEntities = new ArrayList[NLPEntities]()
      requiredNLPEntities.add(NLPEntities.POS_TAGGER)
      requiredNLPEntities.add(NLPEntities.VP_CHUNKER)
      requiredNLPEntities.add(NLPEntities.NP_CHUNKER)

      // Generic GATE NLP pipeline; the module is instantiated once per partition
      val nlpGenericRDD = messageRDD.mapPartitions { iter =>
        val nlpModule = new GateGenericNLP(pluginHome, requiredNLPEntities)
        iter.map { x =>
          val nlpGenericJson = nlpModule.generateNLPEntities(x._2)
          val genericNLPObject = Utility.jsonToGenericNLP(nlpGenericJson)
          (x._1, x._2, genericNLPObject)
        }
      }

      val requiredNEREntities = new ArrayList[String]()
      requiredNEREntities.add("DRUG")
      requiredNEREntities.add("SE")
      requiredNEREntities.add("REG")
      requiredNEREntities.add("ALT_THERAPY")
      requiredNEREntities.add("ALT_DRUG")

      // NER on top of the generic annotations, again one module instance per partition
      val nlpRDD = nlpGenericRDD.mapPartitions { iter =>
        val nerModule = new GateNERImpl(pluginHome, requiredNEREntities)
        iter.map { x =>
          val nerJson = nerModule.generateNER(x._2, Utility.objectToJson(x._3))
          val nerJsonObject = Utility.jsonToGateNer(nerJson)
          val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
          nlpEntities.setToken(x._3.getToken())
          nlpEntities.setSpaceToken(x._3.getSpaceToken())
          nlpEntities.setSentence(x._3.getSentence())
          nlpEntities.setSplit(x._3.getSplit())
          nlpEntities.setVG(x._3.getVG)
          nlpEntities.setNounChunk(x._3.getNounChunk)
          nlpEntities.setDRUG(nerJsonObject.getDRUG())
          nlpEntities.setREG(nerJsonObject.getREG())
          nlpEntities.setSE(nerJsonObject.getSE())
          nlpEntities.setALT_DRUG(nerJsonObject.getALT_DRUG())
          nlpEntities.setALT_THERAPY(nerJsonObject.getALT_THERAPY())
          (x._1, nlpEntities)
        }
      }

      // Convert to HBase Puts and write back through the job configuration
      val newRDD = nlpRDD.map { k => convertToPut(k) }
      newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
      return 0
    } catch {
      case e: MalformedURLException =>
        e.printStackTrace()
        return 1
      case e: GateException =>
        e.printStackTrace()
        return 1
    }
  }

  def convertToPut(genericNlpWithRowKey: (String, NLPEntitiesDAO)): (ImmutableBytesWritable, Put) = {
    val rowkey = genericNlpWithRowKey._1
    val genericNLP = genericNlpWithRowKey._2
    val put = new Put(Bytes.toBytes(rowkey))

    // Column families: "gen" for generic NLP output, "ner" for NER output, "f" for status flags
    val genCFDataBytes = Bytes.toBytes("gen")
    val nerCFDataBytes = Bytes.toBytes("ner")
    val flagCFDataBytes = Bytes.toBytes("f")

    put.add(genCFDataBytes, Bytes.toBytes("token"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getToken())))
    put.add(genCFDataBytes, Bytes.toBytes("spaceToken"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSpaceToken())))
    put.add(genCFDataBytes, Bytes.toBytes("sentence"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSentence())))
    put.add(genCFDataBytes, Bytes.toBytes("verbGroup"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getVG())))
    put.add(genCFDataBytes, Bytes.toBytes("split"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSplit())))
    put.add(genCFDataBytes, Bytes.toBytes("nounChunk"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getNounChunk())))
    put.add(nerCFDataBytes, Bytes.toBytes("drug"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getDRUG())))
    put.add(nerCFDataBytes, Bytes.toBytes("sideEffect"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSE())))
    put.add(nerCFDataBytes, Bytes.toBytes("regimen"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getREG())))
    put.add(nerCFDataBytes, Bytes.toBytes("altTherapy"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getALT_THERAPY())))
    put.add(nerCFDataBytes, Bytes.toBytes("altDrug"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getALT_DRUG())))
    put.add(flagCFDataBytes, Bytes.toBytes("is_processed"), Bytes.toBytes("1"))
    put.add(flagCFDataBytes, Bytes.toBytes("dStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("rStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("adStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("atStatus"), Bytes.toBytes("0"))

    (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), put)
  }

  def pipeLineExecute(args: Array[String]): Int = {
    var batchString = ""
    val usage = "Usage: NLPAnnotationController" +
      " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }
    val conf = VocpConfiguration.create
    for (i <- 0 until args.length by 2) {
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i + 1))
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i + 1))
      } else if ("-batchId" == args(i)) {
        batchString = args(i + 1) // take the flag's value, not the flag itself
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }
    }
    val result = nlpAnnotationExtraction(conf, batchString)
    result
  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }
}
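A side note on the code above: setMaster("local") is hard-coded into the SparkConf. As far as I know, settings made in code take precedence over spark-submit's --master flag, so for a cluster run the context would normally be created without it, along these lines (just a sketch of the alternative construction):

val sc = new SparkContext(new SparkConf().setAppName("NLPAnnotationController"))

This shouldn't be the cause of the ClassNotFoundException below, though, since that is thrown before the application's main method even runs.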
I am building a fat jar to submit with spark-submit. Here is the pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.scryAnalytics</groupId>
    <artifactId>NLPAnnotationController</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>NLPAnnotationController2</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.6.0-cdh5.7.2</hadoop.version>
        <jdk.version>1.7</jdk.version>
        <sdk.version>2.10.5</sdk.version>
        <hbase.version>0.98.16-hadoop2</hbase.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.10.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-spark</artifactId>
            <version>1.2.0-cdh5.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit-dep</artifactId>
            <version>4.8.2</version>
        </dependency>
        <dependency>
            <groupId>uk.ac.gate</groupId>
            <artifactId>gate-core</artifactId>
            <version>8.1</version>
        </dependency>
        <dependency>
            <groupId>uk.ac.gate</groupId>
            <artifactId>gate-compiler-jdt</artifactId>
            <version>4.3.2-P20140317-1600</version>
        </dependency>
        <dependency>
            <groupId>com.thoughtworks.xstream</groupId>
            <artifactId>xstream</artifactId>
            <version>1.4.8</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <version>1.9.13</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
            <version>1.9.13</version>
        </dependency>
        <dependency>
            <groupId>com.scryAnalytics</groupId>
            <artifactId>NLPGeneric</artifactId>
            <version>1.1</version>
        </dependency>
        <dependency>
            <groupId>NER</groupId>
            <artifactId>NER</artifactId>
            <version>1.2</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>NLPAnnotationController</finalName>
        <plugins>
            <!-- Download source code in Eclipse (best practice) -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <version>2.9</version>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <downloadJavadocs>false</downloadJavadocs>
                </configuration>
            </plugin>
            <!-- Set the compiler level -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <!-- Maven Assembly Plugin: builds the fat jar -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4.1</version>
                <configuration>
                    <!-- Get all project dependencies -->
                    <descriptors>
                        <descriptor>src/main/assembly/hadoop-job.xml</descriptor>
                    </descriptors>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <!-- Main class in the manifest makes an executable jar -->
                    <archive>
                        <manifest>
                            <mainClass>com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <!-- Bind to the packaging phase -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <resources>
            <resource>
                <directory>conf</directory>
            </resource>
        </resources>
    </build>
</project>
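Since the assembly plugin is configured with both a custom descriptor (src/main/assembly/hadoop-job.xml, not shown here) and the jar-with-dependencies descriptorRef, the build should produce more than one jar under target/ (the -job suffix presumably comes from the custom descriptor). A quick way to check whether the main class actually made it into the jar being submitted is to list its contents, e.g.:

jar tf target/NLPAnnotationController-job.jar | grep NlpProcessing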
Error:
spark-submit target/NLPAnnotationController-job.jar -inputTable posts -outputTable posts -batchId 1
java.lang.ClassNotFoundException: com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:278)
at org.apache.spark.util.Utils$.classForName(Utils.scala:174)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:689)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
As I said, it works perfectly fine in IntelliJ. Any help would be greatly appreciated.
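For completeness, I know spark-submit can also be given the main class explicitly rather than relying on the jar manifest; in that form the command would look like this (same jar and program arguments as above):

spark-submit --class com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing \
  target/NLPAnnotationController-job.jar \
  -inputTable posts -outputTable posts -batchId 1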