Joining DataFrames using temporary views with Spark Streaming throws NullPointerException

Date: 2018-04-10 09:50:39

Tags: java json apache-spark spark-streaming

I am trying to join multiple JSON text files using Spark Streaming. The idea is to parse the JSON, create temporary tables, and join them once all three temporary tables are available. However, when I attempt the join, the job fails with a NullPointerException.
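
Conceptually, the join I am after would look something like the following (a minimal sketch only, assuming the three views "alerts", "contracts" and "inventory" are already registered in the SparkSession named session in the full code below, and share the partyId join key):

    // Sketch only: join the three temporary views on partyId once all of
    // them are available in the current SparkSession.
    Dataset<Row> merged = session.sql(
        "SELECT i.*, a.ALERT, c.CONTRACT "
      + "FROM inventory i "
      + "JOIN alerts a ON i.partyId = a.partyId "
      + "JOIN contracts c ON i.partyId = c.partyId");
    merged.show();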

Here is the complete code. I am running it with spark-submit. The Spark version is 2.1.0.

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;

import java.util.Arrays;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.log4j.Logger;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.collection.Seq;

public class StreamingConsumer {

    final static transient Logger logger = Logger.getLogger(StreamingConsumer.class);

    public static void main(String[] args) {
        logger.info("Starting spark job to merge the datasets");
        SparkConf config = new SparkConf().setAppName("HDFS Streaming Job");
        JavaStreamingContext jsc = new JavaStreamingContext(config, new Duration(100));
        JavaPairInputDStream<LongWritable, Text> fileLines = jsc.fileStream(args[0], LongWritable.class, Text.class,
                TextInputFormat.class);
        JavaDStream<String> dstream = fileLines.map(line -> {
            logger.info(line._2.toString());
            return line._2.toString();
        });

        Seq<String> joinColumn = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList("partyId"));

        dstream.foreachRDD(rdd -> {
            SparkSession session = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            if (!rdd.isEmpty()) {
                logger.info("Processing RDD");
                Dataset<Row> dataset = session.read().json(rdd);
                Dataset<Row> inventory = null;
                Dataset<Row> contracts = null;
                Dataset<Row> alerts = null;
                if (dataset.select("recordType").equals("PanResult")) {
                    dataset.select(col("partyId"), col("sourceNeId"), struct("neAlert").as("ALERT"));
                    logger.info("Created Alerts ---------------- ");
                    dataset.createOrReplaceTempView("alerts");
                    alerts = session.sql("select * from alerts");
                    alerts.show();
                } else if (dataset.select("recordType").equals("ContractCoverageNeResult")) {
                    dataset.select(col("partyId"), col("sourceNeId"), struct("neContract").as("CONTRACT"));
                    logger.info("Created Contracts ---------------- ");
                    dataset.createOrReplaceTempView("contracts");
                    contracts = session.sql("select * from contracts");
                    contracts.show();
                } else if (dataset.select("recordType").equals("NeInventoryResult")) {
                    dataset.createOrReplaceTempView("inventory");
                    inventory = session.sql("select * from inventory");
                    inventory.show();
                }

                if (session.catalog().tableExists("alerts") && session.catalog().tableExists("contracts")
                        && session.catalog().tableExists("inventory")) {

                    logger.info("Inside the IF clause");

                    Dataset<Row> finaljson = inventory.join(alerts, joinColumn, "inner").join(contracts, joinColumn,
                            "inner");
                    finaljson.write().mode(SaveMode.Overwrite).json("/user/creando/CDX_Merge");
                    finaljson.printSchema();

                } else {
                    logger.info("Inside the ELSE clause");
                }
            }
        });

        try {
            jsc.start();
            jsc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } finally {
            jsc.stop();
            jsc.close();
        }
    }

}

class JavaSparkSessionSingleton {
    private static transient SparkSession instance = null;

    public static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}

Exception:

2018-04-10 09:09:53,987 ERROR [JobScheduler] scheduler.JobScheduler: Error running job streaming job 1523351390000 ms.0
java.lang.NullPointerException
        at com.cisco.sdp.cdx.HadoopStreaming.StreamingConsumer.lambda$1(StreamingConsumer.java:60)
        at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
        at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
        at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
        at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
        at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
        at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
        at scala.util.Try$.apply(Try.scala:192)
        at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:254)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
        at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
        at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:253)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Steps I tried before posting the question here:

  • Verified the contents of all the JSON files. They are valid.
  • Verified that the HDFS directory for writing the final JSON exists. It does.
  • Verified that the JSON is parsed, using Dataset.printSchema(). I can see the schema in the logs (see the sketch just after this list).
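
A minimal sketch of that kind of per-batch check (the distinct recordType listing is an extra illustration and is not part of the original job; dstream and JavaSparkSessionSingleton are the names used in the code above):

    // Sketch only: parse each non-empty micro-batch and log the inferred
    // schema plus the recordType values, so the parsing step can be
    // confirmed from the driver logs.
    dstream.foreachRDD(rdd -> {
        if (!rdd.isEmpty()) {
            SparkSession session = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> parsed = session.read().json(rdd);
            parsed.printSchema();
            parsed.select("recordType").distinct().show();
        }
    });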

Here are the schemas:

Inventory:

    root
     |-- Equipment: struct (nullable = true)
     |-- LicenseActivated: struct (nullable = true)
     |    |-- items: array (nullable = true)
     |    |    |-- element: string (containsNull = true)
     |-- NetworkElement: struct (nullable = true)
     |    |    |    |    |-- udiProductIdentifier: string (nullable = true)
     |-- Versions: struct (nullable = true)
     |    |-- items: array (nullable = true)
     |    |    |-- element: string (containsNull = true)
     |-- collectorId: string (nullable = true)
     |-- generatedAt: long (nullable = true)
     |-- managedNeId: string (nullable = true)
     |-- partyId: string (nullable = true)
     |-- recordType: string (nullable = true)
     |-- sourceNeId: string (nullable = true)
     |-- sourcePartyId: string (nullable = true)
     |-- sourceSubPartyId: string (nullable = true)
     |-- wfid: string (nullable = true)


Contracts:

    root
     |-- partyId: string (nullable = true)
     |-- sourceNeId: string (nullable = true)
     |-- CONTRACT: struct (nullable = false)
     |    |-- neContract: struct (nullable = true)

Alert:

    root
     |-- partyId: string (nullable = true)
     |-- sourceNeId: string (nullable = true)
     |-- ALERT: struct (nullable = false)
     |    |-- neAlert: struct (nullable = true)

One thing I am not able to verify is whether the temporary views that were created actually exist. Please help.
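
The only idea I have for checking this is a sketch along these lines (it assumes the SparkSession Catalog API, and I have not confirmed what it reports between micro-batches):

    // Sketch only: log the tables/temporary views registered in the current
    // SparkSession, plus an explicit existence check for each expected view.
    session.catalog().listTables().show(false);
    logger.info("alerts exists: " + session.catalog().tableExists("alerts"));
    logger.info("contracts exists: " + session.catalog().tableExists("contracts"));
    logger.info("inventory exists: " + session.catalog().tableExists("inventory"));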

Edit:

I see the problem is with this particular snippet of code. Is this the correct way to check whether a table exists?

I tried multiple ways of checking whether the tables exist. I tried df.rdd().isEmpty() and df.head().isEmpty(), and finally settled on the following logic.

    if (session.catalog().tableExists("alerts") && session.catalog().tableExists("contracts")
            && session.catalog().tableExists("inventory")) {

        logger.info("Inside the IF clause");

        Dataset<Row> finaljson = inventory.join(alerts, joinColumn, "inner")
                .join(contracts, joinColumn, "inner");
        finaljson.write().mode(SaveMode.Overwrite).json("/user/creando/CDX_Merge");
        finaljson.printSchema();

    } else {
        logger.info("Inside the ELSE clause");
    }
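
As an illustration only (these null checks are not in the original code): tableExists only looks at the catalog, whereas inventory, alerts and contracts are local variables that are re-initialised to null on every batch, so a separate guard would be needed to confirm they were actually assigned.

    // Illustration only: guard the join on the local Dataset references,
    // which are re-initialised to null on every batch, in addition to the
    // catalog-level existence check above.
    if (inventory != null && alerts != null && contracts != null) {
        Dataset<Row> finaljson = inventory.join(alerts, joinColumn, "inner")
                .join(contracts, joinColumn, "inner");
        finaljson.printSchema();
    } else {
        logger.info("One or more of the datasets is not available in this batch");
    }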

0 Answers:

There are no answers.