I am trying to join multiple JSON text files using Spark Streaming. The idea is to parse the JSON, create temp tables, and join them once all three temp tables are available. However, the job fails with a NullPointerException when it attempts the join.
Here is the complete code snippet. I am running it with spark-submit, and the Spark version is 2.1.0.
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;
import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.log4j.Logger;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.collection.Seq;
public class StreamingConsumer {

    final static transient Logger logger = Logger.getLogger(StreamingConsumer.class);

    public static void main(String[] args) {
        logger.info("Starting spark job to merge the datasets");
        SparkConf config = new SparkConf().setAppName("HDFS Streaming Job");
        JavaStreamingContext jsc = new JavaStreamingContext(config, new Duration(100));
        JavaPairInputDStream<LongWritable, Text> fileLines = jsc.fileStream(args[0], LongWritable.class, Text.class,
                TextInputFormat.class);
        JavaDStream<String> dstream = fileLines.map(line -> {
            logger.info(line._2.toString());
            return line._2.toString();
        });
        Seq<String> joinColumn = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList("partyId"));
        dstream.foreachRDD(rdd -> {
            SparkSession session = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            if (!rdd.isEmpty()) {
                logger.info("Processing RDD");
                Dataset<Row> dataset = session.read().json(rdd);
                Dataset<Row> inventory = null;
                Dataset<Row> contracts = null;
                Dataset<Row> alerts = null;
                if (dataset.select("recordType").equals("PanResult")) {
                    dataset.select(col("partyId"), col("sourceNeId"), struct("neAlert").as("ALERT"));
                    logger.info("Created Alerts ---------------- ");
                    dataset.createOrReplaceTempView("alerts");
                    alerts = session.sql("select * from alerts");
                    alerts.show();
                } else if (dataset.select("recordType").equals("ContractCoverageNeResult")) {
                    dataset.select(col("partyId"), col("sourceNeId"), struct("neContract").as("CONTRACT"));
                    logger.info("Created Contracts ---------------- ");
                    dataset.createOrReplaceTempView("contracts");
                    contracts = session.sql("select * from contracts");
                    contracts.show();
                } else if (dataset.select("recordType").equals("NeInventoryResult")) {
                    dataset.createOrReplaceTempView("inventory");
                    inventory = session.sql("select * from inventory");
                    inventory.show();
                }
                if (session.catalog().tableExists("alerts") && session.catalog().tableExists("contracts")
                        && session.catalog().tableExists("inventory")) {
                    logger.info("Inside the IF clause");
                    Dataset<Row> finaljson = inventory.join(alerts, joinColumn, "inner").join(contracts, joinColumn,
                            "inner");
                    finaljson.write().mode(SaveMode.Overwrite).json("/user/creando/CDX_Merge");
                    finaljson.printSchema();
                } else {
                    logger.info("Inside the ELSE clause");
                }
            }
        });
        try {
            jsc.start();
            jsc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } finally {
            jsc.stop();
            jsc.close();
        }
    }
}

class JavaSparkSessionSingleton {

    private static transient SparkSession instance = null;

    public static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}
Exception:
2018-04-10 09:09:53,987 ERROR [JobScheduler] scheduler.JobScheduler: Error running job streaming job 1523351390000 ms.0
java.lang.NullPointerException
at com.cisco.sdp.cdx.HadoopStreaming.StreamingConsumer.lambda$1(StreamingConsumer.java:60)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:253)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I tried a few different things before posting the question here. I used Dataset.printSchema() to verify that the JSON is being parsed, and I can see the schemas in the logs. Below are the schemas:
Inventory:
root
|-- Equipment: struct (nullable = true)
|-- LicenseActivated: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- NetworkElement: struct (nullable = true)
| | | | |-- udiProductIdentifier: string (nullable = true)
|-- Versions: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- collectorId: string (nullable = true)
|-- generatedAt: long (nullable = true)
|-- managedNeId: string (nullable = true)
|-- partyId: string (nullable = true)
|-- recordType: string (nullable = true)
|-- sourceNeId: string (nullable = true)
|-- sourcePartyId: string (nullable = true)
|-- sourceSubPartyId: string (nullable = true)
|-- wfid: string (nullable = true)
Contracts:
root
|-- partyId: string (nullable = true)
|-- sourceNeId: string (nullable = true)
|-- CONTRACT: struct (nullable = false)
| |-- neContract: struct (nullable = true)
Alert:
root
|-- partyId: string (nullable = true)
|-- sourceNeId: string (nullable = true)
|-- ALERT: struct (nullable = false)
| |-- neAlert: struct (nullable = true)
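For clarity, the ALERT and CONTRACT columns in the last two schemas come from the struct(...).as(...) projections in the corresponding branches; printing the alert projection on its own (inside foreachRDD, on the same dataset variable) would look roughly like this illustrative snippet:

// Illustrative only: the projection whose schema matches the "Alert" schema above;
// the "Contracts" schema has the same shape with neContract / CONTRACT instead.
Dataset<Row> alertsProjected = dataset.select(col("partyId"), col("sourceNeId"),
        struct("neAlert").as("ALERT"));
alertsProjected.printSchema();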
One thing I am not able to verify is whether the temp views are actually getting created. Please help.
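One idea for checking this, which I have not tried yet, is to log the catalog contents from inside foreachRDD after the if/else-if branches, along these lines:

// Untested idea, to go inside foreachRDD after the branches: list everything registered
// in the shared SparkSession and log whether the three temp views have appeared yet.
session.catalog().listTables().show();
logger.info("alerts view exists: " + session.catalog().tableExists("alerts"));
logger.info("contracts view exists: " + session.catalog().tableExists("contracts"));
logger.info("inventory view exists: " + session.catalog().tableExists("inventory"));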
Edit:
I see that the problem is with this specific piece of code. Is this the right way to check whether the tables exist?
I tried multiple ways to check for the existence of the tables, including df.rdd().isEmpty() and df.head().isEmpty(), and finally landed on the following logic:
if (session.catalog().tableExists("alerts") && session.catalog().tableExists("contracts")
        && session.catalog().tableExists("inventory")) {
    logger.info("Inside the IF clause");
    Dataset<Row> finaljson = inventory.join(alerts, joinColumn, "inner").join(contracts, joinColumn,
            "inner");
    finaljson.write().mode(SaveMode.Overwrite).json("/user/creando/CDX_Merge");
    finaljson.printSchema();
} else {
    logger.info("Inside the ELSE clause");
}
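An alternative I am considering, but have not tested, is to read the data back from the registered temp views at join time instead of going through the local inventory, contracts and alerts variables, since those are re-initialised to null at the top of every foreachRDD call:

// Untested alternative: resolve the three inputs from the catalog rather than from the
// local Dataset variables, which are reset to null in every batch.
if (session.catalog().tableExists("alerts") && session.catalog().tableExists("contracts")
        && session.catalog().tableExists("inventory")) {
    Dataset<Row> finaljson = session.table("inventory")
            .join(session.table("alerts"), joinColumn, "inner")
            .join(session.table("contracts"), joinColumn, "inner");
    finaljson.printSchema();
    finaljson.write().mode(SaveMode.Overwrite).json("/user/creando/CDX_Merge");
}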