Unable to run code with spark-submit

Date: 2016-12-07 19:15:11

Tags: scala maven apache-spark

I have this Scala code that I want to run from the terminal with the spark-submit command. Running it inside the IntelliJ IDE works without any problem.

The code:

package com.scryAnalytics.NLPAnnotationController.Work

import java.net.MalformedURLException
import java.util.{ArrayList, Arrays}

import com.scryAnalytics.NLPAnnotationController.Configuration.{VOCPConstants, VocpConfiguration}
import com.scryAnalytics.NLPAnnotationController.DAO.NLPEntitiesDAO
import com.scryAnalytics.NLPGeneric.{NLPEntities, _}
import com.vocp.ner.main.GateNERImpl
import gate.util.GateException
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HTableDescriptor, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}

class NLPProcessingLog {
  val log: Logger = Logger.getLogger(classOf[NLPProcessingLog])
  log.info("Logger Initialized .....")
}

object NlpProcessing {

  val logger = new NLPProcessingLog

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(conf: org.apache.hadoop.conf.Configuration, batchString: String): Int = {

    logger.log.info("In Main Object..")

    // Initializing the Spark context.
    // Note: setMaster("local") hard-codes the master and overrides any
    // --master option passed to spark-submit.
    val sc = new SparkContext(new SparkConf().setAppName("NLPAnnotationController").setMaster("local"))

    val batchId =
      if (batchString == "newbatch")
        java.lang.Long.toString(System.currentTimeMillis())
      else batchString

    conf.set("batchId", batchId)

    val inputCfs = Arrays.asList(conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(","): _*)

    try {
      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))

      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      // Create the output table if it does not exist yet.
      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }

      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      // Keep only the rows whose is_processed flag is absent or "0";
      // the flag is stored as a string, so it must be compared against "0".
      val processedFilteredRDD = hBaseRDD.map(x => x._2).filter { result =>
        val flag = Bytes.toString(result.getValue(Bytes.toBytes("f"),
          Bytes.toBytes("is_processed")))
        (flag == null) || (flag == "0")
      }

      println(processedFilteredRDD.count())

      val messageRDD = processedFilteredRDD.filter { x => x != null }.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"),
          Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }

      println("Number of partitions " + messageRDD.getNumPartitions)

      val pluginHome = conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE)
      val requiredNLPEntities = new ArrayList[NLPEntities]()
      requiredNLPEntities.add(NLPEntities.POS_TAGGER)
      requiredNLPEntities.add(NLPEntities.VP_CHUNKER)
      requiredNLPEntities.add(NLPEntities.NP_CHUNKER)

      // mapPartitions so the heavyweight GATE pipeline is constructed once
      // per partition on the executor, not once per record.
      val nlpGenericRDD = messageRDD.mapPartitions { iter =>
        val nlpModule = new GateGenericNLP(pluginHome, requiredNLPEntities)
        iter.map { x =>
          val nlpGenericJson = nlpModule.generateNLPEntities(x._2)
          val genericNLPObject = Utility.jsonToGenericNLP(nlpGenericJson)
          (x._1, x._2, genericNLPObject)
        }
      }

      val requiredNEREntities = new ArrayList[String]()
      requiredNEREntities.add("DRUG")
      requiredNEREntities.add("SE")
      requiredNEREntities.add("REG")
      requiredNEREntities.add("ALT_THERAPY")
      requiredNEREntities.add("ALT_DRUG")

      // Same pattern for the NER stage: one GateNERImpl per partition.
      val nlpRDD = nlpGenericRDD.mapPartitions { iter =>
        val nerModule = new GateNERImpl(pluginHome, requiredNEREntities)
        iter.map { x =>
          val nerJson = nerModule.generateNER(x._2, Utility.objectToJson(x._3))
          val nerJsonObject = Utility.jsonToGateNer(nerJson)

          val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
          nlpEntities.setToken(x._3.getToken())
          nlpEntities.setSpaceToken(x._3.getSpaceToken())
          nlpEntities.setSentence(x._3.getSentence())
          nlpEntities.setSplit(x._3.getSplit())
          nlpEntities.setVG(x._3.getVG)
          nlpEntities.setNounChunk(x._3.getNounChunk)

          nlpEntities.setDRUG(nerJsonObject.getDRUG())
          nlpEntities.setREG(nerJsonObject.getREG())
          nlpEntities.setSE(nerJsonObject.getSE())
          nlpEntities.setALT_DRUG(nerJsonObject.getALT_DRUG())
          nlpEntities.setALT_THERAPY(nerJsonObject.getALT_THERAPY())
          (x._1, nlpEntities)
        }
      }

      val newRDD = nlpRDD.map { k => convertToPut(k) }
      newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
      0
    } catch {
      case e: MalformedURLException =>
        e.printStackTrace()
        1
      case e: GateException =>
        e.printStackTrace()
        1
    }
  }

  // Build an HBase Put for one (rowKey, entities) pair.
  def convertToPut(genericNlpWithRowKey: (String, NLPEntitiesDAO)): (ImmutableBytesWritable, Put) = {
    val rowkey = genericNlpWithRowKey._1
    val genericNLP = genericNlpWithRowKey._2
    val put = new Put(Bytes.toBytes(rowkey))
    val genCFDataBytes = Bytes.toBytes("gen")
    val nerCFDataBytes = Bytes.toBytes("ner")
    val flagCFDataBytes = Bytes.toBytes("f")

    put.add(genCFDataBytes, Bytes.toBytes("token"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getToken())))
    put.add(genCFDataBytes, Bytes.toBytes("spaceToken"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSpaceToken())))
    put.add(genCFDataBytes, Bytes.toBytes("sentence"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSentence())))
    put.add(genCFDataBytes, Bytes.toBytes("verbGroup"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getVG())))
    put.add(genCFDataBytes, Bytes.toBytes("split"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSplit())))
    put.add(genCFDataBytes, Bytes.toBytes("nounChunk"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getNounChunk())))

    put.add(nerCFDataBytes, Bytes.toBytes("drug"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getDRUG())))
    put.add(nerCFDataBytes, Bytes.toBytes("sideEffect"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getSE())))
    put.add(nerCFDataBytes, Bytes.toBytes("regimen"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getREG())))
    put.add(nerCFDataBytes, Bytes.toBytes("altTherapy"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getALT_THERAPY())))
    put.add(nerCFDataBytes, Bytes.toBytes("altDrug"),
      Bytes.toBytes(Utility.objectToJson(genericNLP.getALT_DRUG())))

    // Mark the row as processed and initialize the remaining status flags.
    put.add(flagCFDataBytes, Bytes.toBytes("is_processed"), Bytes.toBytes("1"))
    put.add(flagCFDataBytes, Bytes.toBytes("dStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("rStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("adStatus"), Bytes.toBytes("0"))
    put.add(flagCFDataBytes, Bytes.toBytes("atStatus"), Bytes.toBytes("0"))

    (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), put)
  }

  // Parse command-line flags of the form "-flag value" and run the job.
  def pipeLineExecute(args: Array[String]): Int = {

    var batchString = ""
    val usage = "Usage: NLPAnnotationController" + " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }

    val conf = VocpConfiguration.create
    for (i <- 0 until args.length by 2) {
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i + 1))
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i + 1))
      } else if ("-batchId" == args(i)) {
        batchString = args(i + 1)
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }
    }
    nlpAnnotationExtraction(conf, batchString)
  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }

}
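
For completeness, running it from IntelliJ is just executing the main method; it is equivalent to calling the entry point directly with the same flags I pass on the command line below (a minimal sketch):

// Equivalent of the IDE run configuration: invoke the entry point directly
// with the flags that pipeLineExecute parses.
NlpProcessing.main(Array("-inputTable", "posts", "-outputTable", "posts", "-batchId", "1"))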

I am building a fat jar file to use with spark-submit. The project's pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.scryAnalytics</groupId>
<artifactId>NLPAnnotationController</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>

<name>NLPAnnotationController2</name>
<url>http://maven.apache.org</url>

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>2.6.0-cdh5.7.2</hadoop.version>
    <jdk.version>1.7</jdk.version>
    <sdk.version>2.10.5</sdk.version>
    <hbase.version>0.98.16-hadoop2</hbase.version>
</properties>

<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<pluginRepositories>
    <pluginRepository>
        <id>scala-tools.org</id>
        <name>Scala-tools Maven2 Repository</name>
        <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
</pluginRepositories>

<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>2.10.5</version>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.6.1</version>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.10</artifactId>
        <version>1.6.1</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-spark</artifactId>
        <version>1.2.0-cdh5.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>

    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit-dep</artifactId>
        <version>4.8.2</version>
    </dependency>

    <dependency>
        <groupId>uk.ac.gate</groupId>
        <artifactId>gate-core</artifactId>
        <version>8.1</version>
    </dependency>
    <dependency>
        <groupId>uk.ac.gate</groupId>
        <artifactId>gate-compiler-jdt</artifactId>
        <version>4.3.2-P20140317-1600</version>
    </dependency>

    <dependency>
        <groupId>com.thoughtworks.xstream</groupId>
        <artifactId>xstream</artifactId>
        <version>1.4.8</version>
    </dependency>

    <dependency>
        <groupId>org.codehaus.jackson</groupId>
        <artifactId>jackson-core-asl</artifactId>
        <version>1.9.13</version>
    </dependency>

    <dependency>
        <groupId>org.codehaus.jackson</groupId>
        <artifactId>jackson-mapper-asl</artifactId>
        <version>1.9.13</version>
    </dependency>

    <dependency>
        <groupId>com.scryAnalytics</groupId>
        <artifactId>NLPGeneric</artifactId>
        <version>1.1</version>
    </dependency>

    <dependency>
        <groupId>NER</groupId>
        <artifactId>NER</artifactId>
        <version>1.2</version>
    </dependency>
</dependencies>

<build>
    <finalName>NLPAnnotationController</finalName>
    <plugins>

        <!-- Download source code in Eclipse (best practice) -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-eclipse-plugin</artifactId>
            <version>2.9</version>
            <configuration>
                <downloadSources>true</downloadSources>
                <downloadJavadocs>false</downloadJavadocs>
            </configuration>
        </plugin>

        <!-- Set a compiler level -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.3</version>
            <configuration>
                <source>${jdk.version}</source>
                <target>${jdk.version}</target>
            </configuration>
        </plugin>

        <!-- Maven Assembly Plugin -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.4.1</version>
            <configuration>
                <!-- get all project dependencies -->
                <descriptors>
                    <descriptor>src/main/assembly/hadoop-job.xml</descriptor>
                </descriptors>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <!-- MainClass in the manifest makes an executable jar -->
                <archive>
                    <manifest>
                        <mainClass>com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing</mainClass>
                    </manifest>
                </archive>

            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <!-- bind to the packaging phase -->
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>

    </plugins>
    <resources>
        <resource>
            <directory>conf</directory>
        </resource>
    </resources>
</build>
</project>
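
For the build I just run the standard Maven package phase, since the assembly plugin is bound to it. A sketch of what I expect it to produce and how I launch it (the "-job" suffix is my assumption about the <id> declared in src/main/assembly/hadoop-job.xml, which is not shown here; --class names the entry point explicitly instead of relying on the jar manifest):

mvn clean package
# Expected artifacts under target/:
#   NLPAnnotationController-jar-with-dependencies.jar  (from the jar-with-dependencies descriptorRef)
#   NLPAnnotationController-job.jar                    (from hadoop-job.xml, assuming its <id> is "job")

spark-submit --class com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing \
    target/NLPAnnotationController-job.jar -inputTable posts -outputTable posts -batchId 1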

The error:

spark-submit target/NLPAnnotationController-job.jar -inputTable posts -outputTable posts -batchId 1 
java.lang.ClassNotFoundException: com.scryAnalytics.NLPAnnotationController.Work.NlpProcessing
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:278)
at org.apache.spark.util.Utils$.classForName(Utils.scala:174)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:689)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)

As I said, it works perfectly fine in IntelliJ. Any help would be appreciated.

0 Answers:

No answers yet.