I have a Spark program that converts COBOL copybook data to XML, but when I try to run it I get a java.io.EOFException. I have searched Google a lot, but have not found many useful links.

I have a sample flat file (a .bin file) that converts from copybook format to XML without any trouble. But when I run the program against another flat file, I get the error.

The COBOL copybook is saved at
file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/edited1.cbl
and the input flat file is
file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input

The error is this:
20:37:29.653 ERROR org.apache.spark.scheduler.TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.io.EOFException
at java.io.DataInputStream.readFully(Unknown Source)
at java.io.DataInputStream.readFully(Unknown Source)
at org.apache.spark.input.FixedLengthBinaryRecordReader.nextKeyValue(FixedLengthBinaryRecordReader.scala:118)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(generated.java:41)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:63)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1533)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1521)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1520)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1520)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1748)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1703)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1692)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2439)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2438)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2846)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2845)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2438)
at za.co.absa.cobrix.spark.cobol.examples.CobolSparkExample1$.main(CobolSparkExample1.scala:54)
at za.co.absa.cobrix.spark.cobol.examples.CobolSparkExample1.main(CobolSparkExample1.scala)
Caused by: java.io.EOFException
at java.io.DataInputStream.readFully(Unknown Source)
at java.io.DataInputStream.readFully(Unknown Source)
at org.apache.spark.input.FixedLengthBinaryRecordReader.nextKeyValue(FixedLengthBinaryRecordReader.scala:118)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(generated.java:41)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:63)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
package za.co.absa.cobrix.spark.cobol.examples

import org.apache.spark.sql.{SaveMode, SparkSession}
import za.co.absa.cobrix.spark.cobol.utils.SparkUtils
import com.databricks.spark.xml._

object CobolSparkExample1 {

  def main(args: Array[String]): Unit = {
    val sparkBuilder = SparkSession.builder().appName("Cobol source reader example 1")
    val spark = sparkBuilder
      .master("local[*]")
      .getOrCreate()

    // Read the mainframe file, using the copybook as the schema
    val df = spark
      .read
      .format("za.co.absa.cobrix.spark.cobol.source")
      .option("copybook", "file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/edited1.cbl")
      .load("file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input")

    df.printSchema()
    println(df.count) // the job fails here (CobolSparkExample1.scala:54 in the stack trace)

    // Write the DataFrame out as XML
    df.write
      .format("com.databricks.spark.xml")
      //.option("rowTag", "book")
      //.option("rootTag", "book")
      .save("file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/Output/newbooks.xml")
  }
}
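From the stack trace, the EOFException is thrown by DataInputStream.readFully inside Spark's FixedLengthBinaryRecordReader, which suggests the reader expects fixed-length records and the file size is not an exact multiple of the record length derived from the copybook. A quick standalone check along these lines could confirm that (a minimal sketch; RecordLengthCheck is an illustrative name and the recordLength value of 100 is a placeholder for whatever LRECL edited1.cbl actually implies):

import java.nio.file.{Files, Paths}

object RecordLengthCheck {
  def main(args: Array[String]): Unit = {
    val dataPath = Paths.get("/Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input")
    // Placeholder: replace 100 with the record length (LRECL) implied by edited1.cbl
    val recordLength = 100L
    val fileSize = Files.size(dataPath)
    println(s"file size = $fileSize bytes, remainder = ${fileSize % recordLength}")
    // A non-zero remainder means the last record is short, which is exactly
    // the condition that makes DataInputStream.readFully throw java.io.EOFException
  }
}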
The input source file looks like this:
ACHFOUND 079675882 1446320661365001Y00000 000M 8019721193 ACHFOUND 6613-144632 000875 <EOR>
ACHFOUND 079675882 1446320661365001Y10000 S10 ACHFOUND 875079675882 144632 11180524180525SAFEWAY 21130 8019721193 ACHFOUND 1805241300000000000087500000000180524144632 BTALBERTSONS COMPANIES, LLC 9 0091372096500NATIONAL SERVICES CENTER P.O. BOX 29093 PHOENIX AZ85038 STALBERTSONS LLC A SUB. OF ALBERTSONS COMPANIES, LLC 9 0091372096613 <EOR>
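Note that the two sample records above end with <EOR> and are clearly different lengths, while the reader in the stack trace assumes every record has the same size. A minimal sketch to confirm the varying lengths (assuming the file is single-byte text and <EOR> is a literal record delimiter; EorRecordLengths is just an illustrative name):

import scala.io.Source

object EorRecordLengths {
  def main(args: Array[String]): Unit = {
    val src = Source.fromFile(
      "/Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input",
      "ISO-8859-1")
    try {
      // Split on the literal <EOR> marker and report each record's byte length
      src.mkString.split("<EOR>").map(_.trim).filter(_.nonEmpty).zipWithIndex.foreach {
        case (rec, i) => println(s"record $i: ${rec.getBytes("ISO-8859-1").length} bytes")
      }
    } finally src.close()
  }
}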