org.apache.spark.rdd.NewHadoopRDD - Failed to use InputSplit#getLocationInfo

Time: 2017-04-11 07:18:03

Tags: apache-spark hbase apache-zookeeper

I am using Java (8) to connect Spark (1.6.0) to HBase (1.2.2, ZooKeeper 3.4.6). The equivalent Scala code on the Spark client works fine. The Spark cluster, the HBase cluster, and the ZooKeeper cluster are all in the cloud. The code is as follows:

package com.carelinker.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.mesos.protobuf.ServiceException;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SparktestApplication {
    public static void main(String[] args) throws ServiceException {
        //HBase config
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.100.5:2181,192.168.100.3:2181,192.168.100.6:2181");
        conf.set("zookeeper.znode.parent", "/hbase/hbs-frz2bnnm");
        conf.set(TableInputFormat.INPUT_TABLE, "test");
        //Spark config
        SparkContext sc = new SparkContext("spark://skn-1w3zsyz0-spark-master:7077", "HBaseRead");
        sc.setLocalProperty("spark.executor.extraClassPath","/usr/local/hbase/lib/*");
        sc.setLocalProperty("spark.driver.extraClassPath","/usr/local/hbase/lib/*");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sc);
        JavaPairRDD<ImmutableBytesWritable, Result> myRDD = javaSparkContext.newAPIHadoopRDD(conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
        System.out.println(myRDD.count());
    }
}
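
For reference, here is a minimal sketch (not part of my application) of how the same master URL and classpath settings could instead be supplied through a SparkConf before the context is created; the master URL and library paths are simply copied from the code above:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch: build the configuration up front rather than calling setLocalProperty
// on an already-created SparkContext. Values are the same assumptions as above.
SparkConf sparkConf = new SparkConf()
        .setAppName("HBaseRead")
        .setMaster("spark://skn-1w3zsyz0-spark-master:7077")
        .set("spark.executor.extraClassPath", "/usr/local/hbase/lib/*")
        .set("spark.driver.extraClassPath", "/usr/local/hbase/lib/*");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);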

The log keeps printing "15:04:46.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0", and before that there is an exception, shown below:

15:03:46.380 [dag-scheduler-event-loop] DEBUG org.apache.spark.rdd.NewHadoopRDD - Failed to use InputSplit#getLocationInfo.
java.lang.NullPointerException: null
    at scala.collection.mutable.ArrayOps$ofRef$.length$extension(ArrayOps.scala:114)
    at scala.collection.mutable.ArrayOps$ofRef.length(ArrayOps.scala:114)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:32)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at org.apache.spark.rdd.HadoopRDD$.convertSplitLocationInfo(HadoopRDD.scala:412)
    at org.apache.spark.rdd.NewHadoopRDD.getPreferredLocations(NewHadoopRDD.scala:233)
    at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
    at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.preferredLocations(RDD.scala:256)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$getPreferredLocsInternal(DAGScheduler.scala:1545)
    at org.apache.spark.scheduler.DAGScheduler.getPreferredLocs(DAGScheduler.scala:1519)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:974)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:972)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:972)
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
    at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
15:03:46.417 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1 stored as values in memory (estimated size 1688.0 B, free 209.3 KB)
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1 locally took  2 ms
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1 without replication took  3 ms
15:03:46.424 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1_piece0 stored as bytes in memory (estimated size 1076.0 B, free 210.3 KB)
15:03:46.424 [dispatcher-event-loop-2] INFO org.apache.spark.storage.BlockManagerInfo - Added broadcast_1_piece0 in memory on 192.168.56.1:54197 (size: 1076.0 B, free: 1800.4 MB)
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManagerMaster - Updated info of block broadcast_1_piece0
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Told master about block broadcast_1_piece0
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1_piece0 locally took  3 ms
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1_piece0 without replication took  4 ms
15:03:46.425 [dag-scheduler-event-loop] INFO org.apache.spark.SparkContext - Created broadcast 1 from broadcast at DAGScheduler.scala:1006
15:03:46.428 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Submitting 1 missing tasks from ResultStage 0 (NewHadoopRDD[0] at newAPIHadoopRDD at SparktestApplication.java:28)
15:03:46.429 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.DAGScheduler - New pending partitions: Set(0)
15:03:46.430 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Adding task set 0.0 with 1 tasks
15:03:46.434 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Epoch for TaskSet 0.0: 0
15:03:46.445 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Valid locality levels for TaskSet 0.0: ANY
15:03:46.462 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:46.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:47.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:48.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:49.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:50.973 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:51.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:52.975 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:53.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:54.975 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:55.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:56.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:57.973 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:58.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0

I have searched for this issue. The answer to "Spark give Null pointer exception during InputSplit for Hbase" suggests configuring hbase.master, but that did not help me.
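
For completeness, this is a minimal sketch of that suggested change applied to the Configuration above; the host and port are assumptions for illustration only, not values taken from my cluster:

// Hypothetical example of the suggested setting; the real HBase master
// address would have to be used here.
conf.set("hbase.master", "192.168.100.5:60000");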

I have the following questions:

  1. What does this exception mean?
  2. Why does the parentName log keep repeating without ever producing a result?

Thanks for your help.

    Below are the details of the Maven pom file:

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.carelinker.spark</groupId>
        <artifactId>sparktest</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>sparktest</name>
        <description>Demo project for Spring Boot</description>
    
        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>1.5.2.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
            <java.version>1.8</java.version>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>1.2.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-server</artifactId>
                <version>1.2.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.10</artifactId>
                <version>1.6.0</version>
                <exclusions>
                    <exclusion>
                        <groupId>org.eclipse.jetty.orbit</groupId>
                        <artifactId>javax.servlet</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.10</artifactId>
                <version>1.6.0</version>
            </dependency>
            <dependency>
                <groupId>javax.servlet</groupId>
                <artifactId>javax.servlet-api</artifactId>
                <version>3.1.0</version>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-tomcat</artifactId>
                <!--<scope>provided</scope>-->
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
            </dependency>
        </dependencies>
    
        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                </plugin>
            </plugins>
        </build>
    
    
    </project>
    

0 Answers:

No answers yet.