Spark Streaming with XmlInputFormat recognizes the XML records in the input files, but the Spark workers fail with a not-serializable error for org.apache.hadoop.io.LongWritable.
import java.util.Calendar

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object XmlStreaming {
  def main(args: Array[String]) {
    println("start of program")
    if (args.length < 1) {
      System.err.println("Usage: XmlStreaming <directory>")
      System.exit(1)
    }

    val sparkConf = new SparkConf().setAppName("XmlStreaming")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(30))

    val windowDStream = ssc.fileStream[LongWritable, Text, XmlInputFormat](args(0), (x: Path) => true, true)

    windowDStream.foreachRDD { incomingFiles =>
      println("Interval data processing " + Calendar.getInstance().getTime())
      if (incomingFiles.count() == 0) {
        println("No files received in this interval")
      } else {
        // println("1st line:" + incomingFiles.take(5).deep.mkString("\n"))
        println(incomingFiles.count() + " files received in this interval")
        incomingFiles.collect().foreach(println)
        println("end of processing")
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
XmlInputFormat
The start tag and end tag of the input XML file are hardcoded; a sketch of supplying them through the job configuration instead follows the class below.
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class XmlInputFormat extends TextInputFormat {

    private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);

    // public static final String START_TAG_KEY = "xmlinput.start";
    // public static final String END_TAG_KEY = "xmlinput.end";
    public static final String START_TAG_KEY = "<catalog>";
    public static final String END_TAG_KEY = "</catalog>";

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        try {
            return new XmlRecordReader((FileSplit) split, context.getConfiguration());
        } catch (IOException ioe) {
            log.warn("Error while creating XmlRecordReader", ioe);
            return null;
        }
    }

    /**
     * XmlRecordReader class to read through a given xml document and output xml blocks as records,
     * as specified by the start tag and end tag.
     */
    public static class XmlRecordReader extends RecordReader<LongWritable, Text> {

        private final byte[] startTag;
        private final byte[] endTag;
        private final long start;
        private final long end;
        private final FSDataInputStream fsin;
        private final DataOutputBuffer buffer = new DataOutputBuffer();
        private LongWritable currentKey;
        private Text currentValue;

        public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
            // startTag = conf.get(START_TAG_KEY).getBytes(StandardCharsets.UTF_8);
            // endTag = conf.get(END_TAG_KEY).getBytes(StandardCharsets.UTF_8);
            System.out.println("in XmlRecordReader method");
            startTag = "<catalog>".getBytes(StandardCharsets.UTF_8);
            endTag = "</catalog>".getBytes(StandardCharsets.UTF_8);

            // open the file and seek to the start of the split
            start = split.getStart();
            end = start + split.getLength();
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(conf);
            fsin = fs.open(split.getPath());
            fsin.seek(start);
        }

        private boolean next(LongWritable key, Text value) throws IOException {
            System.out.println("in next method");
            if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
                try {
                    buffer.write(startTag);
                    if (readUntilMatch(endTag, true)) {
                        key.set(fsin.getPos());
                        value.set(buffer.getData(), 0, buffer.getLength());
                        return true;
                    }
                } finally {
                    buffer.reset();
                }
            }
            return false;
        }

        @Override
        public void close() throws IOException {
            fsin.close();
        }

        @Override
        public float getProgress() throws IOException {
            return (fsin.getPos() - start) / (float) (end - start);
        }

        private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
            System.out.println("in readUntilMatch method");
            int i = 0;
            while (true) {
                int b = fsin.read();
                // end of file:
                if (b == -1) {
                    return false;
                }
                // save to buffer:
                if (withinBlock) {
                    buffer.write(b);
                }
                // check if we're matching:
                if (b == match[i]) {
                    i++;
                    if (i >= match.length) {
                        return true;
                    }
                } else {
                    i = 0;
                }
                // see if we've passed the stop point:
                if (!withinBlock && i == 0 && fsin.getPos() >= end) {
                    return false;
                }
            }
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            System.out.println("in getCurrentKey");
            return currentKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            System.out.println("in getCurrentValue");
            return currentValue;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            System.out.println("in Initialize");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            System.out.println("in nextKeyValue method");
            currentKey = new LongWritable();
            currentValue = new Text();
            return next(currentKey, currentValue);
        }
    }
}
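If the tags should not stay hardcoded, one option is to restore the commented-out START_TAG_KEY = "xmlinput.start" / END_TAG_KEY = "xmlinput.end" constants, read them back in the XmlRecordReader constructor via conf.get(...), and set them from the Scala driver. A minimal sketch of the driver side, assuming a Spark version whose fileStream overload accepts a Hadoop Configuration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}

// Copy the context's Hadoop configuration and add the tag properties that
// XmlRecordReader would read via conf.get(START_TAG_KEY) / conf.get(END_TAG_KEY).
val hadoopConf = new Configuration(sc.hadoopConfiguration)
hadoopConf.set("xmlinput.start", "<catalog>")
hadoopConf.set("xmlinput.end", "</catalog>")

val windowDStream = ssc.fileStream[LongWritable, Text, XmlInputFormat](
  args(0), (p: Path) => true, true, hadoopConf)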
Input file: http://pastebin.com/kVcJgVHH
Output:
start of program
15/03/18 17:25:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Interval data processing Wed Mar 18 17:25:30 IST 2015
2 files received in this interval
15/03/18 17:25:33 ERROR TaskSetManager: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable; not retrying
15/03/18 17:25:33 ERROR JobScheduler: Error running job streaming job 1426679730000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 2.0 (TID 2) had a not serializable result: org.apache.hadoop.io.LongWritable
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Answer (score: 2)
LongWritable (and the other Hadoop Writable types) is not Java-serializable. The solution is to do the map right away and convert the Writables into native Java types (e.g., call get on the LongWritable, toString on the Text, and so on).
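Applied to the driver in the question, the conversion can happen inside foreachRDD before anything is collected. A minimal sketch, keeping the names from the question and assuming the (LongWritable, Text) pairs produced by fileStream:

windowDStream.foreachRDD { incomingFiles =>
  println("Interval data processing " + Calendar.getInstance().getTime())
  // Convert the Hadoop Writables to plain Scala/Java types on the executors,
  // so only serializable (Long, String) pairs are ever sent back to the driver.
  val records = incomingFiles.map { case (offset, xml) => (offset.get(), xml.toString) }
  if (records.count() == 0) {
    println("No files received in this interval")
  } else {
    println(records.count() + " records received in this interval")
    records.collect().foreach(println)
    println("end of processing")
  }
}

As a general caveat, Hadoop input formats may reuse Writable instances, so converting (or copying) them in the very first map over the pair RDD is the usual practice anyway.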