Creating a Spark RDD from a Hadoop sequence file does not work

Date: 2017-01-17 11:39:40

Tags: apache-spark rdd hadoop-streaming

I am new to Spark and I am trying to create an RDD from a Hadoop sequence file, but I am getting the error below. I have searched online articles about it, but could not sort it out with the solutions given there. Can someone help me with this?

My pom file looks like this:

  <?xml version="1.0" encoding="UTF-8"?>
  <project xmlns="http://maven.apache.org/POM/4.0.0"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
    <groupId>com.pearson.tellurium</groupId>
    <artifactId>pw-analytics</artifactId>
    <version>1.0-SNAPSHOT</version>
</parent>

<modelVersion>4.0.0</modelVersion>
<artifactId>aggregation-engine</artifactId>
<name>Pearson Writer Analytics - Aggregation Engine</name>
<packaging>jar</packaging>

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <spark.version>1.6.1</spark.version>
    <json4s.version>3.3.0</json4s.version>
    <scala.compat.version>2.10</scala.compat.version>
    <configuration>test</configuration>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>${spark.version}</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.10</artifactId>
        <version>${spark.version}</version>
        <!--<scope>provided</scope>-->
        <exclusions>
            <exclusion>
                <artifactId>jackson-core-asl</artifactId>
                <groupId>org.codehaus.jackson</groupId>
            </exclusion>
            <exclusion>
                <artifactId>jackson-mapper-asl</artifactId>
                <groupId>org.codehaus.jackson</groupId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.2.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-aws</artifactId>
        <version>2.6.4</version>
    </dependency>
    <dependency>
        <groupId>org.json4s</groupId>
        <artifactId>json4s-core_2.10</artifactId>
        <version>${json4s.version}</version>
    </dependency>
    <dependency>
        <groupId>org.json4s</groupId>
        <artifactId>json4s-native_2.10</artifactId>
        <version>${json4s.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
    <dependency>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest_${scala.compat.version}</artifactId>
        <version>2.2.4</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <dependency>
        <groupId>org.mockito</groupId>
        <artifactId>mockito-all</artifactId>
        <version>2.0.2-beta</version>
    </dependency>
</dependencies>

<build>
    <filters>
        <filter>src/main/filters/${configuration}.properties</filter>

    </filters>
    <finalName>pw-aggregation-engine</finalName>

    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                    <configuration>
                        <descriptorRefs>
                            <descriptorRef>jar-with-dependencies</descriptorRef>
                        </descriptorRefs>
                    </configuration>
                </execution>
                <execution>
                    <id>dist</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                    <configuration>
                        <descriptors>
                            <descriptor>src/main/assembly/bin.xml</descriptor>
                        </descriptors>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <executions>
                <execution>
                    <id>compile</id>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                    <phase>compile</phase>
                </execution>
                <execution>
                    <id>test-compile</id>
                    <goals>
                        <goal>testCompile</goal>
                    </goals>
                    <phase>test-compile</phase>
                </execution>
                <execution>
                    <phase>process-resources</phase>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>1.5</source>
                <target>1.5</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
        </plugin>
        <plugin>
            <groupId>org.scalatest</groupId>
            <artifactId>scalatest-maven-plugin</artifactId>
            <configuration>
                <forkMode>once</forkMode>
                <parallel>false</parallel>
                <argLine>-Xmx512M -XX:PermSize=128M -XX:MaxPermSize=128M
                    -XX:+CMSClassUnloadingEnabled -XX:ReservedCodeCacheSize=512M
                </argLine>
                <reportsDirectory>${project.build.directory}/scalatest-reports</reportsDirectory>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.1</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                        <createDependencyReducedPom>false</createDependencyReducedPom>
                        <artifactSet>
                            <excludes>
                                <exclude>junit:junit</exclude>
                                <exclude>org.aspectj</exclude>
                            </excludes>
                        </artifactSet>
                    </configuration>
                </execution>
            </executions>
            <configuration>
                <filters>
                    <filter>
                        <artifact>*:*</artifact>
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-resources-plugin</artifactId>
            <version>2.7</version>
            <executions>
                <execution>
                    <id>copy-resources</id>
                    <!-- here the phase you need -->
                    <phase>validate</phase>
                    <goals>
                        <goal>copy-resources</goal>
                    </goals>
                    <configuration>
                        <outputDirectory>${basedir}/target/</outputDirectory>
                        <resources>
                            <resource>
                                <directory>src/main/resources</directory>
                                <filtering>true</filtering>

                            </resource>
                        </resources>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
</project>

My SparkContext is created like this:

    def initializeSpark(jobPrefix: String): SparkContext = {
      LOG.info("Initializing Job...")
      LOG.info("Job Name : " + jobPrefix + java.time.LocalDateTime.now.toString)
      val sparkConf = new SparkConf().setAppName(jobPrefix + java.time.LocalDateTime.now.toString).setMaster("local[*]")
      val sparkContext = new SparkContext(sparkConf)
      sparkContext
    }
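
For context, a minimal sketch of how such a context might be used and torn down in a test (the job prefix and the work inside `try` are illustrative, not from the project):

    // Hypothetical usage of initializeSpark (illustrative names only).
    val sc = initializeSpark("aggregation-test-")
    try {
      println(sc.parallelize(1 to 10).sum()) // any Spark work would go here
    } finally {
      sc.stop() // stop the local[*] context so repeated test runs don't leak resources
    }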

I have a previously created sequence file placed in my src/test/resources directory, and I am reading it from that location:

    import java.io.File

    import org.apache.hadoop.io.{BytesWritable, LongWritable}
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.RDD
    import scala.util.Try

    def parseRDD(sparkContext: SparkContext): RDD[String] = {
      val filePath = new File("").getAbsolutePath + "/src/test/resources/1_0_00000000000000000218"

      // Read (LongWritable, BytesWritable) pairs and decode each value into a String.
      val rdd = sparkContext.sequenceFile(filePath, classOf[LongWritable], classOf[BytesWritable])
        .map((hadoopFile: (LongWritable, BytesWritable)) => {
          // BytesWritable.getBytes returns a padded buffer, so keep only getLength bytes.
          val bytes = hadoopFile._2.getBytes
          (hadoopFile._1.get(), new String(bytes.slice(0, hadoopFile._2.getLength)))
        }).map(_._2)

      def emptyStringRDD(): RDD[String] = {
        sparkContext.parallelize(Seq.empty[String])
      }

      // Probe the RDD once; if the file cannot be read, fall back to an empty RDD.
      val validatedRDD = {
        val r = Try(rdd.first)
        if (!r.isFailure) {
          rdd
        } else {
          LOG.debug("Returning Empty RDD:  " + r.failed.get.getMessage)
          emptyStringRDD()
        }
      }
      validatedRDD
    }
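
As an aside, a sketch of how a matching (LongWritable, BytesWritable) fixture could be written with Spark itself; the output path and record contents here are made up for illustration:

    import org.apache.hadoop.io.{BytesWritable, LongWritable}

    // Sketch: write a small (LongWritable, BytesWritable) sequence file that parseRDD can read back.
    // The Writables are created inside the map so they are never serialized from the driver,
    // which matters because Hadoop Writables are not java.io.Serializable.
    val records = Seq("first record", "second record")
    sparkContext.parallelize(records.zipWithIndex)
      .map { case (text, i) =>
        (new LongWritable(i.toLong), new BytesWritable(text.getBytes("UTF-8")))
      }
      .saveAsSequenceFile("/tmp/sample-sequence-file") // produces part-* files under this directory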

When I run this, I get the following error:

   An exception or error caused a run to abort: com.fasterxml.jackson.module.scala.deser.BigDecimalDeserializer$.handledType()Ljava/lang/Class; 
    java.lang.NoSuchMethodError: com.fasterxml.jackson.module.scala.deser.BigDecimalDeserializer$.handledType()Ljava/lang/Class;
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<init>(ScalaNumberDeserializersModule.scala:49)
at com.fasterxml.jackson.module.scala.deser.NumberDeserializers$.<clinit>(ScalaNumberDeserializersModule.scala)
at com.fasterxml.jackson.module.scala.deser.ScalaNumberDeserializersModule$class.$init$(ScalaNumberDeserializersModule.scala:61)
at com.fasterxml.jackson.module.scala.DefaultScalaModule.<init>(DefaultScalaModule.scala:19)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<init>(DefaultScalaModule.scala:35)
at com.fasterxml.jackson.module.scala.DefaultScalaModule$.<clinit>(DefaultScalaModule.scala)
at org.apache.spark.rdd.RDDOperationScope$.<init>(RDDOperationScope.scala:81)
at org.apache.spark.rdd.RDDOperationScope$.<clinit>(RDDOperationScope.scala)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:714)
at org.apache.spark.SparkContext.sequenceFile(SparkContext.scala:1166)
at com.pearson.tellurium.analytics.aggregation.MockRDDCreator$.parseRDD(MockRDDCreator.scala:25)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext$$anonfun$3.apply$mcV$sp(TestSparkContext.scala:41)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext$$anonfun$3.apply(TestSparkContext.scala:39)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext$$anonfun$3.apply(TestSparkContext.scala:39)
at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22)
at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at org.scalatest.FlatSpecLike$$anon$1.apply(FlatSpecLike.scala:1647)
at org.scalatest.Suite$class.withFixture(Suite.scala:1122)
at org.scalatest.FlatSpec.withFixture(FlatSpec.scala:1683)
at org.scalatest.FlatSpecLike$class.invokeWithFixture$1(FlatSpecLike.scala:1644)
at org.scalatest.FlatSpecLike$$anonfun$runTest$1.apply(FlatSpecLike.scala:1656)
at org.scalatest.FlatSpecLike$$anonfun$runTest$1.apply(FlatSpecLike.scala:1656)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
at org.scalatest.FlatSpecLike$class.runTest(FlatSpecLike.scala:1656)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext.org$scalatest$BeforeAndAfter$$super$runTest(TestSparkContext.scala:16)
at org.scalatest.BeforeAndAfter$class.runTest(BeforeAndAfter.scala:200)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext.runTest(TestSparkContext.scala:16)
at org.scalatest.FlatSpecLike$$anonfun$runTests$1.apply(FlatSpecLike.scala:1714)
at org.scalatest.FlatSpecLike$$anonfun$runTests$1.apply(FlatSpecLike.scala:1714)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:390)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:427)
at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483)
at org.scalatest.FlatSpecLike$class.runTests(FlatSpecLike.scala:1714)
at org.scalatest.FlatSpec.runTests(FlatSpec.scala:1683)
at org.scalatest.Suite$class.run(Suite.scala:1424)
at org.scalatest.FlatSpec.org$scalatest$FlatSpecLike$$super$run(FlatSpec.scala:1683)
at org.scalatest.FlatSpecLike$$anonfun$run$1.apply(FlatSpecLike.scala:1760)
at org.scalatest.FlatSpecLike$$anonfun$run$1.apply(FlatSpecLike.scala:1760)
at org.scalatest.SuperEngine.runImpl(Engine.scala:545)
at org.scalatest.FlatSpecLike$class.run(FlatSpecLike.scala:1760)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext.org$scalatest$BeforeAndAfter$$super$run(TestSparkContext.scala:16)
at org.scalatest.BeforeAndAfter$class.run(BeforeAndAfter.scala:241)
at com.pearson.tellurium.analytics.aggregation.TestSparkContext.run(TestSparkContext.scala:16)
at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:55)
at org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2563)
at org.scalatest.tools.Runner$$anonfun$doRunRunRunDaDoRunRun$3.apply(Runner.scala:2557)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:2557)
at org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1044)
at org.scalatest.tools.Runner$$anonfun$runOptionallyWithPassFailReporter$2.apply(Runner.scala:1043)
at org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:2722)
at org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:1043)
at org.scalatest.tools.Runner$.run(Runner.scala:883)
at org.scalatest.tools.Runner.run(Runner.scala)
at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2(ScalaTestRunner.java:138)
at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:28)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)

1 Answer:

Answer 0 (score: 1)

I was able to solve this. It was caused by incompatible versions of the Jackson libraries. I am using Spark 1.6 with Hadoop 2.6. Switching jackson-databind to version 2.4.4 solved the problem completely; previously I was using version 2.2.4, which is what triggered the error (Spark 1.6 ships a jackson-module-scala built against Jackson 2.4.x, so an older jackson-databind on the classpath leads to the NoSuchMethodError above). The following dependency resolved the incompatibility:

    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.4.4</version>
    </dependency>
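
To confirm which jackson-databind actually wins on the runtime classpath (spark-core pulls Jackson in transitively via jackson-module-scala, so the explicitly declared version must stay compatible with it), a quick check like the following can be run from the application; PackageVersion ships inside jackson-databind itself:

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.databind.cfg.PackageVersion

    // Print the jackson-databind version and the jar it was loaded from.
    println(PackageVersion.VERSION) // expect 2.4.4 after the fix
    println(classOf[ObjectMapper].getProtectionDomain.getCodeSource.getLocation)

Running `mvn dependency:tree -Dincludes=com.fasterxml.jackson.core` also shows where each Jackson artifact comes from.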