I am using Java 8 to connect Spark (1.6.0) with HBase (1.2.2, ZooKeeper 3.4.6). The equivalent Scala code run from the Spark client works fine. The Spark cluster, HBase cluster, and ZooKeeper cluster are all hosted in the cloud. The code is as follows:
package com.carelinker.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.mesos.protobuf.ServiceException;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SparktestApplication {

    public static void main(String[] args) throws ServiceException {
        // HBase config
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.zookeeper.quorum", "192.168.100.5:2181,192.168.100.3:2181,192.168.100.6:2181");
        conf.set("zookeeper.znode.parent", "/hbase/hbs-frz2bnnm");
        conf.set(TableInputFormat.INPUT_TABLE, "test");

        // Spark config
        SparkContext sc = new SparkContext("spark://skn-1w3zsyz0-spark-master:7077", "HBaseRead");
        sc.setLocalProperty("spark.executor.extraClassPath", "/usr/local/hbase/lib/*");
        sc.setLocalProperty("spark.driver.extraClassPath", "/usr/local/hbase/lib/*");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sc);

        JavaPairRDD<ImmutableBytesWritable, Result> myRDD =
                javaSparkContext.newAPIHadoopRDD(conf, TableInputFormat.class,
                        ImmutableBytesWritable.class, Result.class);
        System.out.println(myRDD.count());
    }
}
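As an aside: I am calling setLocalProperty above, but I understand that spark.executor.extraClassPath and spark.driver.extraClassPath are normally supplied on a SparkConf before the SparkContext is created, so that they take effect when the executor JVMs are launched. A minimal sketch of that variant, assuming the same master URL and HBase lib path:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch: pass the classpath settings at construction time via SparkConf
// rather than calling setLocalProperty on an already-running context.
SparkConf sparkConf = new SparkConf()
        .setMaster("spark://skn-1w3zsyz0-spark-master:7077")
        .setAppName("HBaseRead")
        .set("spark.executor.extraClassPath", "/usr/local/hbase/lib/*")
        .set("spark.driver.extraClassPath", "/usr/local/hbase/lib/*");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);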
The log keeps printing "15:04:46.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0", and before that there is one exception, shown below:
15:03:46.380 [dag-scheduler-event-loop] DEBUG org.apache.spark.rdd.NewHadoopRDD - Failed to use InputSplit#getLocationInfo.
java.lang.NullPointerException: null
at scala.collection.mutable.ArrayOps$ofRef$.length$extension(ArrayOps.scala:114)
at scala.collection.mutable.ArrayOps$ofRef.length(ArrayOps.scala:114)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:32)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at org.apache.spark.rdd.HadoopRDD$.convertSplitLocationInfo(HadoopRDD.scala:412)
at org.apache.spark.rdd.NewHadoopRDD.getPreferredLocations(NewHadoopRDD.scala:233)
at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
at org.apache.spark.rdd.RDD$$anonfun$preferredLocations$2.apply(RDD.scala:257)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.preferredLocations(RDD.scala:256)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$getPreferredLocsInternal(DAGScheduler.scala:1545)
at org.apache.spark.scheduler.DAGScheduler.getPreferredLocs(DAGScheduler.scala:1519)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:974)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$15.apply(DAGScheduler.scala:972)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:972)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
15:03:46.417 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1 stored as values in memory (estimated size 1688.0 B, free 209.3 KB)
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1 locally took 2 ms
15:03:46.420 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1 without replication took 3 ms
15:03:46.424 [dag-scheduler-event-loop] INFO org.apache.spark.storage.MemoryStore - Block broadcast_1_piece0 stored as bytes in memory (estimated size 1076.0 B, free 210.3 KB)
15:03:46.424 [dispatcher-event-loop-2] INFO org.apache.spark.storage.BlockManagerInfo - Added broadcast_1_piece0 in memory on 192.168.56.1:54197 (size: 1076.0 B, free: 1800.4 MB)
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManagerMaster - Updated info of block broadcast_1_piece0
15:03:46.424 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Told master about block broadcast_1_piece0
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Put block broadcast_1_piece0 locally took 3 ms
15:03:46.425 [dag-scheduler-event-loop] DEBUG org.apache.spark.storage.BlockManager - Putting block broadcast_1_piece0 without replication took 4 ms
15:03:46.425 [dag-scheduler-event-loop] INFO org.apache.spark.SparkContext - Created broadcast 1 from broadcast at DAGScheduler.scala:1006
15:03:46.428 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.DAGScheduler - Submitting 1 missing tasks from ResultStage 0 (NewHadoopRDD[0] at newAPIHadoopRDD at SparktestApplication.java:28)
15:03:46.429 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.DAGScheduler - New pending partitions: Set(0)
15:03:46.430 [dag-scheduler-event-loop] INFO org.apache.spark.scheduler.TaskSchedulerImpl - Adding task set 0.0 with 1 tasks
15:03:46.434 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Epoch for TaskSet 0.0: 0
15:03:46.445 [dag-scheduler-event-loop] DEBUG org.apache.spark.scheduler.TaskSetManager - Valid locality levels for TaskSet 0.0: ANY
15:03:46.462 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:46.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:47.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:48.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:49.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:50.973 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:51.973 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:52.975 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:53.974 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:54.975 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:55.974 [dispatcher-event-loop-1] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:56.974 [dispatcher-event-loop-2] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:57.973 [dispatcher-event-loop-3] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
15:03:58.974 [dispatcher-event-loop-0] DEBUG org.apache.spark.scheduler.TaskSchedulerImpl - parentName: , name: TaskSet_0, runningTasks: 0
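For debugging, this is the kind of standalone check I can run from the driver machine to confirm that HBase itself is reachable outside Spark (a sketch only; note the quorum here lists plain hosts, with the client port given separately):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;

// Plain HBase client scan against the same ZooKeeper quorum, bypassing
// Spark entirely, to separate HBase connectivity from Spark scheduling.
Configuration hbaseConf = HBaseConfiguration.create();
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181");
hbaseConf.set("hbase.zookeeper.quorum", "192.168.100.5,192.168.100.3,192.168.100.6");
hbaseConf.set("zookeeper.znode.parent", "/hbase/hbs-frz2bnnm");

try (Connection connection = ConnectionFactory.createConnection(hbaseConf);
     Table table = connection.getTable(TableName.valueOf("test"));
     ResultScanner scanner = table.getScanner(new Scan())) {
    for (Result row : scanner) {
        System.out.println(row);
    }
}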
I have already searched for this problem; the question Spark give Null pointer exception during InputSplit for Hbase suggests configuring hbase.master, but that did not help me.
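For reference, this is how I added that setting (the master host and port below are placeholders for my actual HBase master address):

// Suggested in the linked answer: point the client at the HBase master
// explicitly. Host and port are placeholders for the real master address.
conf.set("hbase.master", "192.168.100.5:16000");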
My question is: why do the tasks never start running, and how can I fix this? Thanks for your help.
Here are the details of the Maven pom file:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.carelinker.spark</groupId>
    <artifactId>sparktest</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>sparktest</name>
    <description>Demo project for Spring Boot</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.2.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.6.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.eclipse.jetty.orbit</groupId>
                    <artifactId>javax.servlet</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>1.6.0</version>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>3.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-tomcat</artifactId>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>