Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: java.io.EOFException in SPARK

Time: 2019-01-08 15:29:03

Tags: scala apache-spark dataframe split transformation

I have a Spark program that converts COBOL copybook data to XML. When I try to run it, I hit a java.io.EOFException. I have searched Google extensively but have not found any useful links.

I have a sample file, a flat (.bin) file, and I am able to convert it from the copybook format to XML easily. But I have another file, which is also a flat file.

The COBOL copybook file is saved at file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/edited1.cbl

The flat input file is at file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input

I get the error when I run the program with this flat file.

The error is:

20:37:29.653 ERROR org.apache.spark.scheduler.TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.io.EOFException
    at java.io.DataInputStream.readFully(Unknown Source)
    at java.io.DataInputStream.readFully(Unknown Source)
    at org.apache.spark.input.FixedLengthBinaryRecordReader.nextKeyValue(FixedLengthBinaryRecordReader.scala:118)
    at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(generated.java:41)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:63)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1533)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1521)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1520)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1520)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1748)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1703)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1692)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2439)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2438)
    at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2846)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2845)
    at org.apache.spark.sql.Dataset.count(Dataset.scala:2438)
    at za.co.absa.cobrix.spark.cobol.examples.CobolSparkExample1$.main(CobolSparkExample1.scala:54)
    at za.co.absa.cobrix.spark.cobol.examples.CobolSparkExample1.main(CobolSparkExample1.scala)
Caused by: java.io.EOFException
    at java.io.DataInputStream.readFully(Unknown Source)
    at java.io.DataInputStream.readFully(Unknown Source)
    at org.apache.spark.input.FixedLengthBinaryRecordReader.nextKeyValue(FixedLengthBinaryRecordReader.scala:118)
    at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(generated.java:41)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(generated.java:63)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
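
The EOFException is raised inside org.apache.spark.input.FixedLengthBinaryRecordReader, which reads the input as fixed-size records via DataInputStream.readFully; readFully throws EOFException when the file ends in the middle of a record. Below is a minimal diagnostic sketch, not part of the original program: it checks whether the file size is an exact multiple of the record length. The value 1010 is a placeholder assumption; substitute the record length implied by edited1.cbl.

import java.nio.file.{Files, Paths}

object RecordLengthCheck {
  def main(args: Array[String]): Unit = {
    val input = Paths.get("/Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input")
    val recordLength = 1010L // assumption: replace with the copybook's record length (LRECL)
    val size = Files.size(input)
    println(s"file size = $size, remainder = ${size % recordLength}")
    // A non-zero remainder means the final record is truncated, which is
    // exactly the condition that makes readFully throw EOFException.
  }
}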


package za.co.absa.cobrix.spark.cobol.examples

import org.apache.spark.sql.{SaveMode, SparkSession}
import za.co.absa.cobrix.spark.cobol.utils.SparkUtils
import com.databricks.spark.xml._

object CobolSparkExample1 {

  def main(args: Array[String]): Unit = {

    val sparkBuilder = SparkSession.builder().appName("Cobol source reader example 1")
    val spark = sparkBuilder
      .master("local[*]")
      .getOrCreate()

    // Read the mainframe flat file, using the copybook to derive the schema
    val df = spark
      .read
      .format("za.co.absa.cobrix.spark.cobol.source")
      .option("copybook", "file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/edited1.cbl")
      .load("file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input")

    df.printSchema()

    println(df.count) // this count is the action that triggers the EOFException above

    // Write the DataFrame out as XML via spark-xml
    df.write
      .format("com.databricks.spark.xml")
      //.option("rowTag", "book")
      //.option("rootTag", "book")
      .save("file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/Output/newbooks.xml")
  }

}
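
As an aside, spark-xml takes its row and root tags through string-keyed options, so a form like .option(rowTag = "book", rootTag = "book") (the commented-out attempt above) would not compile. A sketch of the documented form, with illustrative tag names:

df.write
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")   // element emitted per row
  .option("rootTag", "books") // single top-level wrapper element
  .save("file:///Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/Output/newbooks.xml")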

The input source file looks like this:

ACHFOUND    079675882    1446320661365001Y00000   000M   8019721193     ACHFOUND       6613-144632                                                                                                                                                                                                                                                             000875                                                                                                                                                                                                                                 <EOR>
ACHFOUND    079675882    1446320661365001Y10000   S10    ACHFOUND    875079675882    144632                11180524180525SAFEWAY   21130     8019721193  ACHFOUND    1805241300000000000087500000000180524144632                BTALBERTSONS COMPANIES, LLC          9 0091372096500NATIONAL SERVICES CENTER           P.O. BOX 29093                     PHOENIX            AZ85038    STALBERTSONS LLC A SUB. OF ALBERTSONS COMPANIES, LLC          9 0091372096613                                                                                                                                 <EOR>
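
Note that each sample record ends with an <EOR> marker, so the records may be terminator-delimited rather than raw fixed-length binary. A small sketch (assuming <EOR> really terminates every record) to check whether the record lengths are uniform:

import scala.io.Source

object EorInspect {
  def main(args: Array[String]): Unit = {
    val source = Source.fromFile("/Users/admin/Desktop/Kafka/cobrix-master/cobrix-master/examples/example_data/xyz_Input")
    val text = try source.mkString finally source.close()
    val records = text.split("<EOR>").map(_.trim).filter(_.nonEmpty)
    println(s"records = ${records.length}")
    println(s"distinct record lengths = ${records.map(_.length).distinct.mkString(", ")}")
  }
}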

0 Answers