Spark Streaming with XmlInputFormat recognizes the XML records in the input files, but the Spark workers fail with a not-serializable error for org.apache.hadoop.io.LongWritable.
import java.util.Calendar

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object XmlStreaming {
  def main(args: Array[String]) {
    println("start of program")
    if (args.length < 1) {
      System.err.println("Usage: XmlStreaming <directory>")
      System.exit(1)
    }

    val sparkConf = new SparkConf().setAppName("XmlStreaming")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(30))

    val windowDStream = ssc.fileStream[LongWritable, Text, XmlInputFormat](args(0), (x: Path) => true, true)

    windowDStream.foreachRDD { incomingFiles =>
      println("Interval data processing " + Calendar.getInstance().getTime())
      if (incomingFiles.count() == 0) {
        println("No files received in this interval")
      } else {
        // println("1st line:" + incomingFiles.take(5).deep.mkString("\n"))
        println(incomingFiles.count() + " files received in this interval")
        incomingFiles.collect().foreach(println)
        println("end of processing")
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
XmlInputFormat
The start tag and end tag of the input XML file are hardcoded; a sketch of supplying them through the job configuration instead follows the class below.
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class XmlInputFormat extends TextInputFormat {

    private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);

    // public static final String START_TAG_KEY = "xmlinput.start";
    // public static final String END_TAG_KEY = "xmlinput.end";
    public static final String START_TAG_KEY = "<catalog>";
    public static final String END_TAG_KEY = "</catalog>";

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        try {
            return new XmlRecordReader((FileSplit) split, context.getConfiguration());
        } catch (IOException ioe) {
            log.warn("Error while creating XmlRecordReader", ioe);
            return null;
        }
    }

    /**
     * XmlRecordReader class to read through a given xml document and output xml blocks as records,
     * as specified by the start tag and end tag.
     */
    public static class XmlRecordReader extends RecordReader<LongWritable, Text> {

        private final byte[] startTag;
        private final byte[] endTag;
        private final long start;
        private final long end;
        private final FSDataInputStream fsin;
        private final DataOutputBuffer buffer = new DataOutputBuffer();
        private LongWritable currentKey;
        private Text currentValue;

        public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
            // startTag = conf.get(START_TAG_KEY).getBytes(StandardCharsets.UTF_8);
            // endTag = conf.get(END_TAG_KEY).getBytes(StandardCharsets.UTF_8);
            System.out.println("in XmlRecordReader method");
            startTag = "<catalog>".getBytes(StandardCharsets.UTF_8);
            endTag = "</catalog>".getBytes(StandardCharsets.UTF_8);

            // open the file and seek to the start of the split
            start = split.getStart();
            end = start + split.getLength();
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(conf);
            fsin = fs.open(split.getPath());
            fsin.seek(start);
        }

        private boolean next(LongWritable key, Text value) throws IOException {
            System.out.println("in next method");
            if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
                try {
                    buffer.write(startTag);
                    if (readUntilMatch(endTag, true)) {
                        key.set(fsin.getPos());
                        value.set(buffer.getData(), 0, buffer.getLength());
                        return true;
                    }
                } finally {
                    buffer.reset();
                }
            }
            return false;
        }

        @Override
        public void close() throws IOException {
            fsin.close();
        }

        @Override
        public float getProgress() throws IOException {
            return (fsin.getPos() - start) / (float) (end - start);
        }

        private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
            System.out.println("in readUntilMatch method");
            int i = 0;
            while (true) {
                int b = fsin.read();
                // end of file:
                if (b == -1) {
                    return false;
                }
                // save to buffer:
                if (withinBlock) {
                    buffer.write(b);
                }
                // check if we're matching:
                if (b == match[i]) {
                    i++;
                    if (i >= match.length) {
                        return true;
                    }
                } else {
                    i = 0;
                }
                // see if we've passed the stop point:
                if (!withinBlock && i == 0 && fsin.getPos() >= end) {
                    return false;
                }
            }
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            System.out.println("in getCurrentKey");
            return currentKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            System.out.println("in getCurrentValue");
            return currentValue;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            System.out.println("in Initialize");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            System.out.println("in nextKeyValue method");
            currentKey = new LongWritable();
            currentValue = new Text();
            return next(currentKey, currentValue);
        }
    }
}
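If the tags should not stay hardcoded, one option is to restore the commented-out START_TAG_KEY = "xmlinput.start" / END_TAG_KEY = "xmlinput.end" constants, read them back in the XmlRecordReader constructor via conf.get(...), and set them from the Scala driver. A minimal sketch of the driver side, assuming a Spark version whose fileStream overload accepts a Hadoop Configuration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}

// Copy the context's Hadoop configuration and add the tag properties that
// XmlRecordReader would read via conf.get(START_TAG_KEY) / conf.get(END_TAG_KEY).
val hadoopConf = new Configuration(sc.hadoopConfiguration)
hadoopConf.set("xmlinput.start", "<catalog>")
hadoopConf.set("xmlinput.end", "</catalog>")

val windowDStream = ssc.fileStream[LongWritable, Text, XmlInputFormat](
  args(0), (p: Path) => true, true, hadoopConf)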
Input file: http://pastebin.com/kVcJgVHH
Output:
start of program
15/03/18 17:25:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Interval data processing Wed Mar 18 17:25:30 IST 2015
2 files received in this interval
15/03/18 17:25:33 ERROR TaskSetManager: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable; not retrying
15/03/18 17:25:33 ERROR JobScheduler: Error running job streaming job 1426679730000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Answer (score: 2)
LongWritable (and the other Hadoop Writable types) is not Java-serializable. The solution is to do the map right away and convert the Writables into native Java types (e.g., call get on the LongWritable, toString on the Text, and so on).
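Applied to the driver in the question, the conversion can happen inside foreachRDD before anything is collected. A minimal sketch, keeping the names from the question and assuming the (LongWritable, Text) pairs produced by fileStream:

windowDStream.foreachRDD { incomingFiles =>
  println("Interval data processing " + Calendar.getInstance().getTime())
  // Convert the Hadoop Writables to plain Scala/Java types on the executors,
  // so only serializable (Long, String) pairs are ever sent back to the driver.
  val records = incomingFiles.map { case (offset, xml) => (offset.get(), xml.toString) }
  if (records.count() == 0) {
    println("No files received in this interval")
  } else {
    println(records.count() + " records received in this interval")
    records.collect().foreach(println)
    println("end of processing")
  }
}

As a general caveat, Hadoop input formats may reuse Writable instances, so converting (or copying) them in the very first map over the pair RDD is the usual practice anyway.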